From ccec25589d7178be755959bf539016df1f66017f Mon Sep 17 00:00:00 2001 From: Marcin Wojdyr Date: Wed, 6 Sep 2023 20:50:29 +0200 Subject: [PATCH] move calculation of E (normalized F) to the library add python bindings modify Binner::setup() --- docs/ecalc-help.txt | 24 ++++++++++ docs/headers.rst | 3 ++ docs/utils.rst | 9 ++++ include/gemmi/binner.hpp | 38 ++++++++-------- include/gemmi/ecalc.hpp | 97 ++++++++++++++++++++++++++++++++++++++++ include/gemmi/stats.hpp | 7 --- prog/ecalc.cpp | 93 ++++++-------------------------------- python/hkl.cpp | 16 +++++-- 8 files changed, 178 insertions(+), 109 deletions(-) create mode 100644 docs/ecalc-help.txt create mode 100644 include/gemmi/ecalc.hpp diff --git a/docs/ecalc-help.txt b/docs/ecalc-help.txt new file mode 100644 index 000000000..7cac37660 --- /dev/null +++ b/docs/ecalc-help.txt @@ -0,0 +1,24 @@ +$ gemmi ecalc -h +Usage: + gemmi ecalc [options] INPUT.mtz OUTPUT.mtz + +Calculates E (normalised structure amplitudes) from F. + -h, --help Print usage and exit. + -V, --version Print version and exit. + -v, --verbose Verbose output. + --F=LABEL Label of the input column (default: F). + --E=LABEL Label of the output column (default: E). + --no-sigma Do not use sigma columns (SIGF->SIGE). + --method=METHOD Method of calculating , one of: + ma - moving average (not implemented yet), + 2 - resolution bins equispaced in 1/d^2, + 3 - resolution bins in 1/d^3 (default), + ec - bins with equal count of reflections. + --binsize=N Number of reflections per bin or in moving window + (default: 200). + --maxbins=N Maximum number of bins (default: 100). + +Column for SIGF is always the next column after F (the type must be Q). +If INPUT.mtz has column E, it is replaced in OUTPUT.mtz. +Otherwise, a new column is appended. +The name for SIGE, if it is added as a column, is 'SIG' + label of E. diff --git a/docs/headers.rst b/docs/headers.rst index ec3ee26c6..bec625f71 100644 --- a/docs/headers.rst +++ b/docs/headers.rst @@ -85,6 +85,9 @@ gemmi/dirwalk.hpp in an alphabetical order. It wraps the tinydir library (as we cannot depend on C++17 yet). +gemmi/ecalc.hpp + Normalization of amplitudes F->E ("Karle" approach, similar to CCP4 ECALC). + gemmi/eig3.hpp Eigen decomposition code for symmetric 3x3 matrices. diff --git a/docs/utils.rst b/docs/utils.rst index 6e1bc2f3d..7413a4f8b 100644 --- a/docs/utils.rst +++ b/docs/utils.rst @@ -454,6 +454,15 @@ Merge intensities from multi-record reflection file. .. literalinclude:: merge-help.txt :language: console +ecalc +===== + +Calculates normalized amplitudes E from amplitudes F. +Uses "Karle" approach, similar to CCP4 ECALC. + +.. literalinclude:: ecalc-help.txt + :language: console + .. _sfcalc: sfcalc diff --git a/include/gemmi/binner.hpp b/include/gemmi/binner.hpp index a866ffd8f..5bc337080 100644 --- a/include/gemmi/binner.hpp +++ b/include/gemmi/binner.hpp @@ -80,27 +80,26 @@ struct Binner { } template - int setup(int nbins, Method method, const DataProxy& proxy, - const UnitCell* cell_=nullptr) { + void setup(int nbins, Method method, const DataProxy& proxy, + const UnitCell* cell_=nullptr, bool with_mids=false, size_t col_idx=0) { + if (col_idx >= proxy.stride()) + fail("wrong col_idx in Binner::setup()"); cell = cell_ ? *cell_ : proxy.unit_cell(); - std::vector inv_d2(proxy.size() / proxy.stride()); - for (size_t i = 0, offset = 0; i < inv_d2.size(); ++i, offset += proxy.stride()) - inv_d2[i] = cell.calculate_1_d2(proxy.get_hkl(offset)); - return setup_from_1_d2(nbins, method, std::move(inv_d2), nullptr); - } - - /// the same as setup(), but returns mid values of 1/d^2. - template - std::vector setup_mid(int nbins, Method method, const DataProxy& proxy, - const UnitCell* cell_=nullptr) { - setup(2 * nbins, method, proxy, cell_); - std::vector mids(nbins); - for (int i = 0; i < nbins; ++i) { - mids[i] = limits[2*i]; - limits[i] = limits[2*i+1]; + std::vector inv_d2; + inv_d2.reserve(proxy.size() / proxy.stride()); + for (size_t offset = 0; offset < proxy.size(); offset += proxy.stride()) + if (col_idx == 0 || !std::isnan(proxy.get_num(offset + col_idx))) + inv_d2.push_back(cell.calculate_1_d2(proxy.get_hkl(offset))); + setup_from_1_d2((with_mids ? 2 * nbins : nbins), + method, std::move(inv_d2), nullptr); + if (with_mids) { + mids.resize(nbins); + for (int i = 0; i < nbins; ++i) { + mids[i] = limits[2*i]; + limits[i] = limits[2*i+1]; + } + limits.resize(nbins); } - limits.resize(nbins); - return mids; } void ensure_limits_are_set() const { @@ -172,6 +171,7 @@ struct Binner { double min_1_d2; double max_1_d2; std::vector limits; // upper limit of each bin + std::vector mids; // the middle of each bin }; inline Correlation combine_two_correlations(const Correlation& a, const Correlation& b) { diff --git a/include/gemmi/ecalc.hpp b/include/gemmi/ecalc.hpp new file mode 100644 index 000000000..9da667d98 --- /dev/null +++ b/include/gemmi/ecalc.hpp @@ -0,0 +1,97 @@ +// Copyright 2023 Global Phasing Ltd. +// +// Normalization of amplitudes F->E ("Karle" approach, similar to CCP4 ECALC). + +#ifndef GEMMI_ECALC_HPP_ +#define GEMMI_ECALC_HPP_ + +#include +#include "binner.hpp" + +namespace gemmi { + +template +std::vector calculate_amplitude_normalizers(const DataProxy& data, int fcol_idx, + const Binner& binner) { + struct CountAndSum { + int n = 0; + double sum = 0.; + }; + int nreflections = data.size() / data.stride(); + std::vector multipliers(nreflections, NAN); + if (data.spacegroup() == nullptr) + gemmi::fail("unknown space group in the data file"); + GroupOps gops = data.spacegroup()->operations(); + std::vector inv_d2(multipliers.size()); + for (size_t i = 0, n = 0; n < data.size(); n += data.stride(), ++i) + inv_d2[i] = data.unit_cell().calculate_1_d2(data.get_hkl(n)); + std::vector bin_index = binner.get_bins_from_1_d2(inv_d2); + std::vector stats(binner.size()); + for (size_t i = 0, n = 0; n < data.size(); n += data.stride(), i++) { + Miller hkl = data.get_hkl(n); + double f = data.get_num(n + fcol_idx); + if (!std::isnan(f)) { + int epsilon = gops.epsilon_factor(hkl); + double inv_epsilon = 1.0 / epsilon; + double f2 = f * f * inv_epsilon; + multipliers[i] = std::sqrt(inv_epsilon); + CountAndSum& cs = stats[bin_index[i]]; + cs.n++; + cs.sum += f2; + } + } + + // simple smoothing with kernel [0.75 1 0.75] + std::vector smoothed(stats.size()); + { + const double k = 0.75; + smoothed[0] = (stats[0].sum + k * stats[1].sum) / (stats[0].n + k * stats[1].n); + size_t n = stats.size() - 1; + for (size_t i = 1; i < n; ++i) + smoothed[i] = (stats[i].sum + k * (stats[i-1].sum + stats[i+1].sum)) + / (stats[i].n + k * (stats[i-1].n + stats[i+1].n)); + smoothed[n] = (stats[n].sum + k * stats[n-1].sum) / (stats[n].n + k * stats[n-1].n); + } + +#if 0 + { + // print shell statistics + std::vector refl_counts(binner.size()); + printf(" shell\t #F\t d\t \tsmoothd\t #refl\t mid d\n"); + for (int idx : bin_index) + ++refl_counts[idx]; + for (size_t i = 0; i < binner.size(); ++i) { + double d = 1 / std::sqrt(binner.limits[i]); + double mid_d = 1 / std::sqrt(binner.mids[i]); + double avg_f2 = stats[i].sum / stats[i].n; + printf("%6zu\t%6d\t%7.3f\t%7.0f\t%7.0f\t%6d\t%7.3f\n", + i+1, stats[i].n, d, avg_f2, smoothed[i], refl_counts[i], mid_d); + } + printf("\n"); + } +#endif + + for (double& x : smoothed) + x = std::sqrt(x); + for (size_t i = 0; i < multipliers.size(); ++i) { + double x = inv_d2[i]; + int bin = bin_index[i]; + double rms = smoothed[bin]; + if (x > binner.mids.front() && x < binner.mids.back()) { + // linear interpolation in 1/d^2 + if (x > binner.mids[bin]) + ++bin; + double x0 = binner.mids[bin - 1]; + double x1 = binner.mids[bin]; + double y0 = smoothed[bin - 1]; + double y1 = smoothed[bin]; + assert(x0 <= x && x <= x1); + rms = y0 + (x - x0) * (y1 - y0) / (x1 - x0); + } + multipliers[i] /= rms; + } + return multipliers; +} + +} // namespace gemmi +#endif diff --git a/include/gemmi/stats.hpp b/include/gemmi/stats.hpp index f2c7d7291..650fee3e2 100644 --- a/include/gemmi/stats.hpp +++ b/include/gemmi/stats.hpp @@ -11,13 +11,6 @@ namespace gemmi { -struct Mean { - int n = 0; - double sum = 0.; - void add_point(double x) { ++n; sum += x; } - double get_mean() const { return sum / n; } -}; - // popular single-pass algorithm for calculating variance and mean struct Variance { int n = 0; diff --git a/prog/ecalc.cpp b/prog/ecalc.cpp index 187d28385..4cd3dca76 100644 --- a/prog/ecalc.cpp +++ b/prog/ecalc.cpp @@ -1,10 +1,9 @@ // Copyright 2023 Global Phasing Ltd. #include // for printf, fprintf -#include // for atof #include "gemmi/binner.hpp" // for Binner #include "gemmi/mtz.hpp" // for Mtz -#include "gemmi/stats.hpp" // for Mean +#include "gemmi/ecalc.hpp" // for Mean #define GEMMI_PROG ecalc #include "options.h" @@ -17,7 +16,7 @@ enum OptionIndex { struct EcalcArg: public Arg { static option::ArgStatus MethodChoice(const option::Option& option, bool msg) { - return Arg::Choice(option, msg, {"ma", "2", "3", "ec"}); + return Arg::Choice(option, msg, {/*"ma",*/ "2", "3", "ec"}); } }; @@ -49,8 +48,7 @@ const option::Descriptor Usage[] = { "\nColumn for SIGF is always the next column after F (the type must be Q)." "\nIf INPUT.mtz has column E, it is replaced in OUTPUT.mtz." "\nOtherwise, a new column is appended." - "\nThe name for SIGE, if it is added as a column, is 'SIG' + label of E." - "\nTo print the list of resolution shells add -vv." }, + "\nThe name for SIGE, if it is added as a column, is 'SIG' + label of E." }, { 0, 0, 0, 0, 0, 0 } }; @@ -72,8 +70,6 @@ int GEMMI_MAIN(int argc, char **argv) { try { Mtz mtz; mtz.read_file_gz(input); - if (mtz.spacegroup == nullptr) - gemmi::fail("Unknown space group of the input data: ", input); const Mtz::Column* fcol = &mtz.get_column_with_label(f_label); if (use_sigma && !fcol->get_next_column_if_type('Q')) gemmi::fail("Column ", f_label, " not followed by sigma column. Use --no-sigma.\n"); @@ -100,7 +96,6 @@ int GEMMI_MAIN(int argc, char **argv) { } if (!mtz.has_data()) // shouldn't happen gemmi::fail("no data"); - gemmi::Binner binner; int f_count = 0; for (size_t n = 0; n < mtz.data.size(); n += mtz.columns.size()) if (!std::isnan(mtz.data[n + fcol_idx])) @@ -114,90 +109,28 @@ int GEMMI_MAIN(int argc, char **argv) { mtz.nreflections, f_count, nbins); if (nbins < 3) gemmi::fail("not enough resolution bins"); - bool use_moving_average = false; + //bool use_moving_average = false; auto method = gemmi::Binner::Method::Dstar3; if (p.options[Method]) switch (p.options[Method].arg[0]) { - case '2': method = gemmi::Binner::Method::Dstar3; break; + case '2': method = gemmi::Binner::Method::Dstar2; break; case 'e': method = gemmi::Binner::Method::EqualCount; break; - case 'm': use_moving_average = true; break; + //case 'm': use_moving_average = true; break; } if (verbose > 0) std::fprintf(stderr, "Calculating E ...\n"); - gemmi::GroupOps gops = mtz.spacegroup->operations(); - std::vector computed_values(mtz.nreflections, NAN); - if (use_moving_average) { - //TODO - } else { - std::vector mids = binner.setup_mid(nbins, method, gemmi::MtzDataProxy{mtz}); - std::vector inv_d2(mtz.nreflections); - for (size_t i = 0, n = 0; n < mtz.data.size(); n += mtz.columns.size(), ++i) - inv_d2[i] = mtz.cell.calculate_1_d2(mtz.get_hkl(n)); - std::vector bin_index = binner.get_bins_from_1_d2(inv_d2); - std::vector denom(binner.size()); - for (size_t i = 0, n = 0; n < mtz.data.size(); n += mtz.columns.size(), i++) { - gemmi::Miller hkl = mtz.get_hkl(n); - double f = mtz.data[n + fcol_idx]; - if (!std::isnan(f)) { - double inv_epsilon = 1.0 / gops.epsilon_factor(hkl); - double f2 = f * f * inv_epsilon; - computed_values[i] = std::sqrt(inv_epsilon); - denom[bin_index[i]].add_point(f2); - } - } - - // simple smoothing with kernel [0.75 1 0.75] - std::vector smoothed(denom.size()); - { - const double k = 0.75; - smoothed[0] = (denom[0].sum + k * denom[1].sum) / (denom[0].n + k * denom[1].n); - size_t n = denom.size() - 1; - for (size_t i = 1; i < n; ++i) - smoothed[i] = (denom[i].sum + k * (denom[i-1].sum + denom[i+1].sum)) - / (denom[i].n + k * (denom[i-1].n + denom[i+1].n)); - smoothed[n] = (denom[n].sum + k * denom[n-1].sum) / (denom[n].n + k * denom[n-1].n); - } + gemmi::Binner binner; + gemmi::MtzDataProxy data_proxy{mtz}; + binner.setup(nbins, method, data_proxy, nullptr, /*with_mids=*/true, fcol_idx); + std::vector multipliers + = gemmi::calculate_amplitude_normalizers(data_proxy, fcol_idx, binner); - if (verbose > 1) { - // print shell statistics - std::vector refl_counts(binner.size()); - printf(" shell\t #F\t d\t \tsmoothd\t #refl\t mid d\n"); - for (int idx : bin_index) - ++refl_counts[idx]; - for (size_t i = 0; i < binner.size(); ++i) { - double d = 1 / std::sqrt(binner.limits[i]); - double mid_d = 1 / std::sqrt(mids[i]); - printf("%6zu\t%6d\t%6.2f\t%7.0f\t%7.0f\t%6d\t%6.2f\n", - i+1, denom[i].n, d, denom[i].get_mean(), smoothed[i], refl_counts[i], mid_d); - } - printf("\n"); - } - for (double& x : smoothed) - x = std::sqrt(x); - for (size_t i = 0; i < computed_values.size(); ++i) { - double x = inv_d2[i]; - int bin = bin_index[i]; - double rms = smoothed[bin]; - if (x > mids.front() && x < mids.back()) { - // linear interpolation in 1/d^2 - if (x > mids[bin]) - ++bin; - double x0 = mids[bin - 1]; - double x1 = mids[bin]; - double y0 = smoothed[bin - 1]; - double y1 = smoothed[bin]; - assert(x0 <= x && x <= x1); - rms = y0 + (x - x0) * (y1 - y0) / (x1 - x0); - } - computed_values[i] /= rms; - } - } if (verbose > 0) std::fprintf(stderr, "Writing %s ...\n", output); for (size_t i = 0, n = 0; n < mtz.data.size(); n += mtz.columns.size(), ++i) { - mtz.data[n + ecol.idx] *= float(computed_values[i]); + mtz.data[n + ecol.idx] *= float(multipliers[i]); if (use_sigma) - mtz.data[n + ecol.idx + 1] *= float(computed_values[i]); + mtz.data[n + ecol.idx + 1] *= float(multipliers[i]); } mtz.write_to_file(output); } catch (std::runtime_error& e) { diff --git a/python/hkl.cpp b/python/hkl.cpp index ebd2c5a2f..1fbcc0508 100644 --- a/python/hkl.cpp +++ b/python/hkl.cpp @@ -4,12 +4,13 @@ #include "gemmi/refln.hpp" #include "gemmi/fourier.hpp" // for get_size_for_hkl, get_f_phi_on_grid, ... #include "tostr.hpp" -#include "gemmi/fprime.hpp" +#include "gemmi/fprime.hpp" // for cromer_liberman #include "gemmi/reciproc.hpp" // for count_reflections, make_miller_vector #include "gemmi/cif2mtz.hpp" // for CifToMtz #include "gemmi/mtz2cif.hpp" // for MtzToCif #include "gemmi/merge.hpp" // for Intensities #include "gemmi/binner.hpp" // for Binner +#include "gemmi/ecalc.hpp" // for calculate_amplitude_normalizers #include "common.h" #include "arrvec.h" // py_array_from_vector @@ -269,11 +270,13 @@ void add_hkl(py::module& m) { .def(py::init<>()) .def("setup", [](Binner& self, int nbins, Binner::Method method, const Mtz& mtz, const UnitCell* cell) { - return self.setup(nbins, method, MtzDataProxy{mtz}, cell); + self.setup(nbins, method, MtzDataProxy{mtz}, cell); + return self.size(); // for compatibility with older versions }, py::arg("nbins"), py::arg("method"), py::arg("mtz"), py::arg("cell")=nullptr) .def("setup", [](Binner& self, int nbins, Binner::Method method, const ReflnBlock& r, const UnitCell* cell) { - return self.setup(nbins, method, ReflnDataProxy(r), cell); + self.setup(nbins, method, ReflnDataProxy(r), cell); + return self.size(); // for compatibility with older versions }, py::arg("nbins"), py::arg("method"), py::arg("r"), py::arg("cell")=nullptr) .def("setup", [](Binner& self, int nbins, Binner::Method method, py::array_t hkl, const UnitCell* cell) { @@ -334,6 +337,13 @@ void add_hkl(py::module& m) { m.def("combine_correlations", &combine_correlations); + m.def("calculate_amplitude_normalizers", + [](const Mtz& mtz, const std::string& f_col, const Binner& binner) { + const Mtz::Column& f = mtz.get_column_with_label(f_col); + return py_array_from_vector( + calculate_amplitude_normalizers(MtzDataProxy{mtz}, f.idx, binner)); + }); + py::class_(m, "HklMatch") .def(py::init([](py::array_t hkl, py::array_t ref) {