From ccec25589d7178be755959bf539016df1f66017f Mon Sep 17 00:00:00 2001
From: Marcin Wojdyr <wojdyr@gmail.com>
Date: Wed, 6 Sep 2023 20:50:29 +0200
Subject: [PATCH] move calculation of E (normalized F) to the library

add python bindings

modify Binner::setup()
---
 docs/ecalc-help.txt      | 24 ++++++++++
 docs/headers.rst         |  3 ++
 docs/utils.rst           |  9 ++++
 include/gemmi/binner.hpp | 38 ++++++++--------
 include/gemmi/ecalc.hpp  | 97 ++++++++++++++++++++++++++++++++++++++++
 include/gemmi/stats.hpp  |  7 ---
 prog/ecalc.cpp           | 93 ++++++--------------------------------
 python/hkl.cpp           | 16 +++++--
 8 files changed, 178 insertions(+), 109 deletions(-)
 create mode 100644 docs/ecalc-help.txt
 create mode 100644 include/gemmi/ecalc.hpp
diff --git a/docs/ecalc-help.txt b/docs/ecalc-help.txt
new file mode 100644
index 000000000..7cac37660
--- /dev/null
+++ b/docs/ecalc-help.txt
@@ -0,0 +1,24 @@
+$ gemmi ecalc -h
+Usage:
+ gemmi ecalc [options] INPUT.mtz OUTPUT.mtz
+
+Calculates E (normalised structure amplitudes) from F.
+  -h, --help       Print usage and exit.
+  -V, --version    Print version and exit.
+  -v, --verbose    Verbose output.
+  --F=LABEL        Label of the input column (default: F).
+  --E=LABEL        Label of the output column (default: E).
+  --no-sigma       Do not use sigma columns (SIGF->SIGE).
+  --method=METHOD  Method of calculating <F^2>, one of:
+                   ma - moving average (not implemented yet),
+                   2 - resolution bins equispaced in 1/d^2,
+                   3 - resolution bins in 1/d^3 (default),
+                   ec - bins with equal count of reflections.
+  --binsize=N      Number of reflections per bin or in moving window
+                   (default: 200).
+  --maxbins=N      Maximum number of bins (default: 100).
+
+Column for SIGF is always the next column after F (the type must be Q).
+If INPUT.mtz has column E, it is replaced in OUTPUT.mtz.
+Otherwise, a new column is appended.
+The name for SIGE, if it is added as a column, is 'SIG' + label of E.
diff --git a/docs/headers.rst b/docs/headers.rst
index ec3ee26c6..bec625f71 100644
--- a/docs/headers.rst
+++ b/docs/headers.rst
@@ -85,6 +85,9 @@ gemmi/dirwalk.hpp
     in an alphabetical order.  It wraps the tinydir library (as we cannot
     depend on C++17 <filesystem> yet).
 
+gemmi/ecalc.hpp
+    Normalization of amplitudes F->E ("Karle" approach, similar to CCP4 ECALC).
+
 gemmi/eig3.hpp
     Eigen decomposition code for symmetric 3x3 matrices.
 
diff --git a/docs/utils.rst b/docs/utils.rst
index 6e1bc2f3d..7413a4f8b 100644
--- a/docs/utils.rst
+++ b/docs/utils.rst
@@ -454,6 +454,15 @@ Merge intensities from multi-record reflection file.
 .. literalinclude:: merge-help.txt
    :language: console
 
+ecalc
+=====
+
+Calculates normalized amplitudes E from amplitudes F.
+Uses "Karle" approach, similar to CCP4 ECALC.
+
+.. literalinclude:: ecalc-help.txt
+   :language: console
+
 .. _sfcalc:
 
 sfcalc
diff --git a/include/gemmi/binner.hpp b/include/gemmi/binner.hpp
index a866ffd8f..5bc337080 100644
--- a/include/gemmi/binner.hpp
+++ b/include/gemmi/binner.hpp
@@ -80,27 +80,26 @@ struct Binner {
   }
 
   template<typename DataProxy>
-  int setup(int nbins, Method method, const DataProxy& proxy,
-            const UnitCell* cell_=nullptr) {
+  void setup(int nbins, Method method, const DataProxy& proxy,
+             const UnitCell* cell_=nullptr, bool with_mids=false, size_t col_idx=0) {
+    if (col_idx >= proxy.stride())
+      fail("wrong col_idx in Binner::setup()");
     cell = cell_ ? *cell_ : proxy.unit_cell();
-    std::vector<double> inv_d2(proxy.size() / proxy.stride());
-    for (size_t i = 0, offset = 0; i < inv_d2.size(); ++i, offset += proxy.stride())
-      inv_d2[i] = cell.calculate_1_d2(proxy.get_hkl(offset));
-    return setup_from_1_d2(nbins, method, std::move(inv_d2), nullptr);
-  }
-
-  /// the same as setup(), but returns mid values of 1/d^2.
-  template<typename DataProxy>
-  std::vector<double> setup_mid(int nbins, Method method, const DataProxy& proxy,
-                                const UnitCell* cell_=nullptr) {
-    setup(2 * nbins, method, proxy, cell_);
-    std::vector<double> mids(nbins);
-    for (int i = 0; i < nbins; ++i) {
-      mids[i] = limits[2*i];
-      limits[i] = limits[2*i+1];
+    std::vector<double> inv_d2;
+    inv_d2.reserve(proxy.size() / proxy.stride());
+    for (size_t offset = 0; offset < proxy.size(); offset += proxy.stride())
+      if (col_idx == 0 || !std::isnan(proxy.get_num(offset + col_idx)))
+        inv_d2.push_back(cell.calculate_1_d2(proxy.get_hkl(offset)));
+    setup_from_1_d2((with_mids ? 2 * nbins : nbins),
+                    method, std::move(inv_d2), nullptr);
+    if (with_mids) {
+      mids.resize(nbins);
+      for (int i = 0; i < nbins; ++i) {
+        mids[i] = limits[2*i];
+        limits[i] = limits[2*i+1];
+      }
+      limits.resize(nbins);
     }
-    limits.resize(nbins);
-    return mids;
   }
 
   void ensure_limits_are_set() const {
@@ -172,6 +171,7 @@ struct Binner {
   double min_1_d2;
   double max_1_d2;
   std::vector<double> limits;  // upper limit of each bin
+  std::vector<double> mids;    // the middle of each bin
 };
 
 inline Correlation combine_two_correlations(const Correlation& a, const Correlation& b) {
diff --git a/include/gemmi/ecalc.hpp b/include/gemmi/ecalc.hpp
new file mode 100644
index 000000000..9da667d98
--- /dev/null
+++ b/include/gemmi/ecalc.hpp
@@ -0,0 +1,97 @@
+// Copyright 2023 Global Phasing Ltd.
+//
+// Normalization of amplitudes F->E ("Karle" approach, similar to CCP4 ECALC).
+
+#ifndef GEMMI_ECALC_HPP_
+#define GEMMI_ECALC_HPP_
+
+#include <cassert>
+#include "binner.hpp"
+
+namespace gemmi {
+
+template<typename DataProxy>
+std::vector<double> calculate_amplitude_normalizers(const DataProxy& data, int fcol_idx,
+                                                    const Binner& binner) {
+  struct CountAndSum {
+    int n = 0;
+    double sum = 0.;
+  };
+  int nreflections = data.size() / data.stride();
+  std::vector<double> multipliers(nreflections, NAN);
+  if (data.spacegroup() == nullptr)
+    gemmi::fail("unknown space group in the data file");
+  GroupOps gops = data.spacegroup()->operations();
+  std::vector<double> inv_d2(multipliers.size());
+  for (size_t i = 0, n = 0; n < data.size(); n += data.stride(), ++i)
+    inv_d2[i] = data.unit_cell().calculate_1_d2(data.get_hkl(n));
+  std::vector<int> bin_index = binner.get_bins_from_1_d2(inv_d2);
+  std::vector<CountAndSum> stats(binner.size());
+  for (size_t i = 0, n = 0; n < data.size(); n += data.stride(), i++) {
+    Miller hkl = data.get_hkl(n);
+    double f = data.get_num(n + fcol_idx);
+    if (!std::isnan(f)) {
+      int epsilon = gops.epsilon_factor(hkl);
+      double inv_epsilon = 1.0 / epsilon;
+      double f2 = f * f * inv_epsilon;
+      multipliers[i] = std::sqrt(inv_epsilon);
+      CountAndSum& cs = stats[bin_index[i]];
+      cs.n++;
+      cs.sum += f2;
+    }
+  }
+
+  // simple smoothing with kernel [0.75 1 0.75]
+  std::vector<double> smoothed(stats.size());
+  {
+    const double k = 0.75;
+    smoothed[0] = (stats[0].sum + k * stats[1].sum) / (stats[0].n + k * stats[1].n);
+    size_t n = stats.size() - 1;
+    for (size_t i = 1; i < n; ++i)
+      smoothed[i] = (stats[i].sum + k * (stats[i-1].sum + stats[i+1].sum))
+                  / (stats[i].n + k * (stats[i-1].n + stats[i+1].n));
+    smoothed[n] = (stats[n].sum + k * stats[n-1].sum) / (stats[n].n + k * stats[n-1].n);
+  }
+
+#if 0
+  {
+    // print shell statistics
+    std::vector<int> refl_counts(binner.size());
+    printf(" shell\t    #F\t    d\t <F^2>\tsmoothd\t  #refl\t mid d\n");
+    for (int idx : bin_index)
+      ++refl_counts[idx];
+    for (size_t i = 0; i < binner.size(); ++i) {
+      double d = 1 / std::sqrt(binner.limits[i]);
+      double mid_d = 1 / std::sqrt(binner.mids[i]);
+      double avg_f2 = stats[i].sum / stats[i].n;
+      printf("%6zu\t%6d\t%7.3f\t%7.0f\t%7.0f\t%6d\t%7.3f\n",
+             i+1, stats[i].n, d, avg_f2, smoothed[i], refl_counts[i], mid_d);
+    }
+    printf("\n");
+  }
+#endif
+
+  for (double& x : smoothed)
+    x = std::sqrt(x);
+  for (size_t i = 0; i < multipliers.size(); ++i) {
+    double x = inv_d2[i];
+    int bin = bin_index[i];
+    double rms = smoothed[bin];
+    if (x > binner.mids.front() && x < binner.mids.back()) {
+      // linear interpolation in 1/d^2
+      if (x > binner.mids[bin])
+        ++bin;
+      double x0 = binner.mids[bin - 1];
+      double x1 = binner.mids[bin];
+      double y0 = smoothed[bin - 1];
+      double y1 = smoothed[bin];
+      assert(x0 <= x && x <= x1);
+      rms = y0 + (x - x0) * (y1 - y0) / (x1 - x0);
+    }
+    multipliers[i] /= rms;
+  }
+  return multipliers;
+}
+
+} // namespace gemmi
+#endif
diff --git a/include/gemmi/stats.hpp b/include/gemmi/stats.hpp
index f2c7d7291..650fee3e2 100644
--- a/include/gemmi/stats.hpp
+++ b/include/gemmi/stats.hpp
@@ -11,13 +11,6 @@
 
 namespace gemmi {
 
-struct Mean {
-  int n = 0;
-  double sum = 0.;
-  void add_point(double x) { ++n; sum += x; }
-  double get_mean() const { return sum / n; }
-};
-
 // popular single-pass algorithm for calculating variance and mean
 struct Variance {
   int n = 0;
diff --git a/prog/ecalc.cpp b/prog/ecalc.cpp
index 187d28385..4cd3dca76 100644
--- a/prog/ecalc.cpp
+++ b/prog/ecalc.cpp
@@ -1,10 +1,9 @@
 // Copyright 2023 Global Phasing Ltd.
 
 #include <cstdio>   // for printf, fprintf
-#include <cstdlib>  // for atof
 #include "gemmi/binner.hpp"  // for Binner
 #include "gemmi/mtz.hpp"     // for Mtz
-#include "gemmi/stats.hpp"   // for Mean
+#include "gemmi/ecalc.hpp"   // for Mean
 
 #define GEMMI_PROG ecalc
 #include "options.h"
@@ -17,7 +16,7 @@ enum OptionIndex {
 
 struct EcalcArg: public Arg {
   static option::ArgStatus MethodChoice(const option::Option& option, bool msg) {
-    return Arg::Choice(option, msg, {"ma", "2", "3", "ec"});
+    return Arg::Choice(option, msg, {/*"ma",*/ "2", "3", "ec"});
   }
 };
 
@@ -49,8 +48,7 @@ const option::Descriptor Usage[] = {
     "\nColumn for SIGF is always the next column after F (the type must be Q)."
     "\nIf INPUT.mtz has column E, it is replaced in OUTPUT.mtz."
     "\nOtherwise, a new column is appended."
-    "\nThe name for SIGE, if it is added as a column, is 'SIG' + label of E."
-    "\nTo print the list of resolution shells add -vv." },
+    "\nThe name for SIGE, if it is added as a column, is 'SIG' + label of E." },
   { 0, 0, 0, 0, 0, 0 }
 };
 
@@ -72,8 +70,6 @@ int GEMMI_MAIN(int argc, char **argv) {
   try {
     Mtz mtz;
     mtz.read_file_gz(input);
-    if (mtz.spacegroup == nullptr)
-      gemmi::fail("Unknown space group of the input data: ", input);
     const Mtz::Column* fcol = &mtz.get_column_with_label(f_label);
     if (use_sigma && !fcol->get_next_column_if_type('Q'))
         gemmi::fail("Column ", f_label, " not followed by sigma column. Use --no-sigma.\n");
@@ -100,7 +96,6 @@ int GEMMI_MAIN(int argc, char **argv) {
     }
     if (!mtz.has_data())  // shouldn't happen
       gemmi::fail("no data");
-    gemmi::Binner binner;
     int f_count = 0;
     for (size_t n = 0; n < mtz.data.size(); n += mtz.columns.size())
       if (!std::isnan(mtz.data[n + fcol_idx]))
@@ -114,90 +109,28 @@ int GEMMI_MAIN(int argc, char **argv) {
                    mtz.nreflections, f_count, nbins);
     if (nbins < 3)
       gemmi::fail("not enough resolution bins");
-    bool use_moving_average = false;
+    //bool use_moving_average = false;
     auto method = gemmi::Binner::Method::Dstar3;
     if (p.options[Method])
       switch (p.options[Method].arg[0]) {
-        case '2': method = gemmi::Binner::Method::Dstar3; break;
+        case '2': method = gemmi::Binner::Method::Dstar2; break;
         case 'e': method = gemmi::Binner::Method::EqualCount; break;
-        case 'm': use_moving_average = true; break;
+        //case 'm': use_moving_average = true; break;
       }
     if (verbose > 0)
       std::fprintf(stderr, "Calculating E ...\n");
-    gemmi::GroupOps gops = mtz.spacegroup->operations();
-    std::vector<double> computed_values(mtz.nreflections, NAN);
-    if (use_moving_average) {
-      //TODO
-    } else {
-      std::vector<double> mids = binner.setup_mid(nbins, method, gemmi::MtzDataProxy{mtz});
-      std::vector<double> inv_d2(mtz.nreflections);
-      for (size_t i = 0, n = 0; n < mtz.data.size(); n += mtz.columns.size(), ++i)
-        inv_d2[i] = mtz.cell.calculate_1_d2(mtz.get_hkl(n));
-      std::vector<int> bin_index = binner.get_bins_from_1_d2(inv_d2);
-      std::vector<gemmi::Mean> denom(binner.size());
-      for (size_t i = 0, n = 0; n < mtz.data.size(); n += mtz.columns.size(), i++) {
-        gemmi::Miller hkl = mtz.get_hkl(n);
-        double f = mtz.data[n + fcol_idx];
-        if (!std::isnan(f)) {
-          double inv_epsilon = 1.0 / gops.epsilon_factor(hkl);
-          double f2 = f * f * inv_epsilon;
-          computed_values[i] = std::sqrt(inv_epsilon);
-          denom[bin_index[i]].add_point(f2);
-        }
-      }
-
-      // simple smoothing with kernel [0.75 1 0.75]
-      std::vector<double> smoothed(denom.size());
-      {
-        const double k = 0.75;
-        smoothed[0] = (denom[0].sum + k * denom[1].sum) / (denom[0].n + k * denom[1].n);
-        size_t n = denom.size() - 1;
-        for (size_t i = 1; i < n; ++i)
-          smoothed[i] = (denom[i].sum + k * (denom[i-1].sum + denom[i+1].sum))
-                      / (denom[i].n + k * (denom[i-1].n + denom[i+1].n));
-        smoothed[n] = (denom[n].sum + k * denom[n-1].sum) / (denom[n].n + k * denom[n-1].n);
-      }
+    gemmi::Binner binner;
+    gemmi::MtzDataProxy data_proxy{mtz};
+    binner.setup(nbins, method, data_proxy, nullptr, /*with_mids=*/true, fcol_idx);
+    std::vector<double> multipliers
+      = gemmi::calculate_amplitude_normalizers(data_proxy, fcol_idx, binner);
 
-      if (verbose > 1) {
-        // print shell statistics
-        std::vector<int> refl_counts(binner.size());
-        printf(" shell\t    #F\t    d\t <F^2>\tsmoothd\t  #refl\t mid d\n");
-        for (int idx : bin_index)
-          ++refl_counts[idx];
-        for (size_t i = 0; i < binner.size(); ++i) {
-          double d = 1 / std::sqrt(binner.limits[i]);
-          double mid_d = 1 / std::sqrt(mids[i]);
-          printf("%6zu\t%6d\t%6.2f\t%7.0f\t%7.0f\t%6d\t%6.2f\n",
-                 i+1, denom[i].n, d, denom[i].get_mean(), smoothed[i], refl_counts[i], mid_d);
-        }
-        printf("\n");
-      }
-      for (double& x : smoothed)
-        x = std::sqrt(x);
-      for (size_t i = 0; i < computed_values.size(); ++i) {
-        double x = inv_d2[i];
-        int bin = bin_index[i];
-        double rms = smoothed[bin];
-        if (x > mids.front() && x < mids.back()) {
-          // linear interpolation in 1/d^2
-          if (x > mids[bin])
-            ++bin;
-          double x0 = mids[bin - 1];
-          double x1 = mids[bin];
-          double y0 = smoothed[bin - 1];
-          double y1 = smoothed[bin];
-          assert(x0 <= x && x <= x1);
-          rms = y0 + (x - x0) * (y1 - y0) / (x1 - x0);
-        }
-        computed_values[i] /= rms;
-      }
-    }
     if (verbose > 0)
       std::fprintf(stderr, "Writing %s ...\n", output);
     for (size_t i = 0, n = 0; n < mtz.data.size(); n += mtz.columns.size(), ++i) {
-      mtz.data[n + ecol.idx] *= float(computed_values[i]);
+      mtz.data[n + ecol.idx] *= float(multipliers[i]);
       if (use_sigma)
-        mtz.data[n + ecol.idx + 1] *= float(computed_values[i]);
+        mtz.data[n + ecol.idx + 1] *= float(multipliers[i]);
     }
     mtz.write_to_file(output);
   } catch (std::runtime_error& e) {
diff --git a/python/hkl.cpp b/python/hkl.cpp
index ebd2c5a2f..1fbcc0508 100644
--- a/python/hkl.cpp
+++ b/python/hkl.cpp
@@ -4,12 +4,13 @@
 #include "gemmi/refln.hpp"
 #include "gemmi/fourier.hpp"  // for get_size_for_hkl, get_f_phi_on_grid, ...
 #include "tostr.hpp"
-#include "gemmi/fprime.hpp"
+#include "gemmi/fprime.hpp"   // for cromer_liberman
 #include "gemmi/reciproc.hpp" // for count_reflections, make_miller_vector
 #include "gemmi/cif2mtz.hpp"  // for CifToMtz
 #include "gemmi/mtz2cif.hpp"  // for MtzToCif
 #include "gemmi/merge.hpp"    // for Intensities
 #include "gemmi/binner.hpp"   // for Binner
+#include "gemmi/ecalc.hpp"    // for calculate_amplitude_normalizers
 
 #include "common.h"
 #include "arrvec.h"  // py_array_from_vector
@@ -269,11 +270,13 @@ void add_hkl(py::module& m) {
     .def(py::init<>())
     .def("setup", [](Binner& self, int nbins, Binner::Method method,
                      const Mtz& mtz, const UnitCell* cell) {
-        return self.setup(nbins, method, MtzDataProxy{mtz}, cell);
+        self.setup(nbins, method, MtzDataProxy{mtz}, cell);
+        return self.size();  // for compatibility with older versions
     }, py::arg("nbins"), py::arg("method"), py::arg("mtz"), py::arg("cell")=nullptr)
     .def("setup", [](Binner& self, int nbins, Binner::Method method,
                      const ReflnBlock& r, const UnitCell* cell) {
-        return self.setup(nbins, method, ReflnDataProxy(r), cell);
+        self.setup(nbins, method, ReflnDataProxy(r), cell);
+        return self.size();  // for compatibility with older versions
     }, py::arg("nbins"), py::arg("method"), py::arg("r"), py::arg("cell")=nullptr)
     .def("setup", [](Binner& self, int nbins, Binner::Method method,
                      py::array_t<int> hkl, const UnitCell* cell) {
@@ -334,6 +337,13 @@ void add_hkl(py::module& m) {
 
   m.def("combine_correlations", &combine_correlations);
 
+  m.def("calculate_amplitude_normalizers",
+        [](const Mtz& mtz, const std::string& f_col, const Binner& binner) {
+      const Mtz::Column& f = mtz.get_column_with_label(f_col);
+      return py_array_from_vector(
+          calculate_amplitude_normalizers(MtzDataProxy{mtz}, f.idx, binner));
+  });
+
   py::class_<HklMatch>(m, "HklMatch")
     .def(py::init([](py::array_t<int, py::array::c_style> hkl,
                      py::array_t<int, py::array::c_style> ref) {