Add units agnostic harmonisation (#42)

* Add failing test of units handling via standard interfaces * Add test of convenience handling method * Pass tests of single method * Fix tests * Add first test of multiple timeseries handling * Pass first multi timeseries test * Pass tests of handling multiple timeseries * Add tests of error handling * Avoid warning * Format * Satisfy stickler * Update CI dependencies * Typo * Fix dependencies again * Test multiple matching overrides * Test default decision tree propagation * Finalise last test * Add example notebook * Docstring * Format * Appease stickler
iiasa · Mar 17, 2022 · 6d81bb1 · 6d81bb1
1 parent 867a173
commit 6d81bb1
Show file tree

Hide file tree

Showing 10 changed files with 4,333 additions and 30 deletions.
diff --git a/.github/workflows/ci-cd-workflow.yml b/.github/workflows/ci-cd-workflow.yml
@@ -42,7 +42,7 @@ jobs:
         conda env update --file ci/environment-conda-default.yml
         conda env update --file ci/environment-conda-forge.yml
         conda env update --file doc/environment.yml
-        pip install -e .[tests,deploy]
+        pip install -e .[tests,deploy,units]
     # if we want to remove stickler
     # - name: Run format and linting tests
     #   shell: bash -l {0}
@@ -103,7 +103,7 @@ jobs:
         conda env update --file ci/environment-conda-default.yml
         conda env update --file ci/environment-conda-forge.yml
         conda env update --file doc/environment.yml
-        pip install -e .[tests,deploy]
+        pip install -e .[tests,deploy,units]
     - name: Install ipopt (${{ runner.os }})
       # see https://github.com/conda-forge/ipopt-feedstock/issues/55
       if: startsWith(runner.os, 'Windows')
@@ -161,7 +161,7 @@ jobs:
         conda env update --file ci/environment-conda-forge.yml
         conda env update --file doc/environment.yml
         conda install -q pandas==${{ matrix.pandas-version }}
-        pip install .[tests]
+        pip install .[tests,units]
     - name: Run tests
       shell: bash -l {0}
       run: |
@@ -206,7 +206,7 @@ jobs:
         conda env update --file ci/environment-conda-default.yml
         conda env update --file ci/environment-conda-forge.yml
         conda env update --file doc/environment.yml
-        pip install -e .[tests,deploy]
+        pip install -e .[tests,deploy,units]
     - name: Download data
       shell: bash -l {0}
       env:

diff --git a/Makefile b/Makefile
@@ -87,14 +87,14 @@ docs: $(VENV_DIR)  ## make the docs
 .PHONY: virtual-environment
 virtual-environment: $(VENV_DIR)  ## make virtual environment for development
 
-$(VENV_DIR):  $(CI_ENVIRONMENT_CONDA_DEFAULT_FILE) $(CI_ENVIRONMENT_CONDA_FORGE_FILE) $(ENVIRONMENT_DOC_FILE)
+$(VENV_DIR):  setup.py $(CI_ENVIRONMENT_CONDA_DEFAULT_FILE) $(CI_ENVIRONMENT_CONDA_FORGE_FILE) $(ENVIRONMENT_DOC_FILE)
 	$(CONDA_EXE) config --add channels conda-forge # sets conda-forge as highest priority
 	# install requirements
 	$(CONDA_EXE) env update --name $(CONDA_DEFAULT_ENV) --file $(CI_ENVIRONMENT_CONDA_DEFAULT_FILE)
 	$(CONDA_EXE) env update --name $(CONDA_DEFAULT_ENV) --file $(CI_ENVIRONMENT_CONDA_FORGE_FILE)
 	$(CONDA_EXE) env update --name $(CONDA_DEFAULT_ENV) --file $(ENVIRONMENT_DOC_FILE)
 	# Install development setup
-	$(VENV_DIR)/bin/pip install -e .[tests,deploy]
+	$(VENV_DIR)/bin/pip install -e .[tests,deploy,units]
 	touch $(VENV_DIR)
 
 .PHONY: release-on-conda

diff --git a/aneris/convenience.py b/aneris/convenience.py
@@ -0,0 +1,175 @@
+from openscm_units import unit_registry
+
+from .harmonize import Harmonizer, default_methods
+from .errors import (
+    AmbiguousHarmonisationMethod,
+    MissingHarmonisationYear,
+    MissingHistoricalError,
+)
+from .methods import harmonize_factors
+
+
+def harmonise_all(scenarios, history, harmonisation_year, overrides=None):
+    """
+    Harmonise all timeseries in ``scenarios`` to match ``history``
+
+    Each timeseries (row) in ``scenarios`` is harmonised independently of
+    the others.
+
+    Parameters
+    ----------
+    scenarios : :obj:`pd.DataFrame`
+        :obj:`pd.DataFrame` containing the timeseries to be harmonised
+
+    history : :obj:`pd.DataFrame`
+        :obj:`pd.DataFrame` containing the historical timeseries to which
+        ``scenarios`` should be harmonised
+
+    harmonisation_year : int
+        The year in which ``scenarios`` should be harmonised to ``history``
+
+    overrides : :obj:`pd.DataFrame`, optional
+        If not provided, the default aneris decision tree is used. Otherwise,
+        ``overrides`` must be a :obj:`pd.DataFrame` containing any
+        specifications for overriding the default aneris methods. Each row
+        specifies one override. The override method is specified in the
+        "method" column. The other columns specify which of the timeseries in
+        ``scenarios`` should use this override by specifying metadata to match
+        (e.g. variable, region). If a cell has a null value (evaluated using
+        `pd.isnull()`) then that scenario characteristic will not be used for
+        filtering for that override e.g. if you have a row with "method" equal
+        to "constant_ratio", region equal to "World" and variable is null then
+        all timeseries in the World region will use the "constant_ratio"
+        method. In contrast, if you have a row with "method" equal to
+        "constant_ratio", region equal to "World" and variable is
+        "Emissions|CO2" then only timeseries with variable equal to
+        "Emissions|CO2" and region equal to "World" will use the
+        "constant_ratio" method.
+
+    Returns
+    -------
+    :obj:`pd.DataFrame`
+        The harmonised timeseries
+
+    Notes
+    -----
+    This interface is nowhere near as sophisticated as aneris' other
+    interfaces. It simply harmonises timeseries; it does not check sectoral
+    sums or other possible errors which can arise when harmonising. If you need
+    such features, do not use this interface.
+
+    Raises
+    ------
+    MissingHistoricalError
+        No historical data is provided for a given timeseries
+
+    MissingHarmonisationYear
+        A value for the harmonisation year is missing or is null in ``history``
+
+    AmbiguousHarmonisationMethod
+        ``overrides`` do not uniquely specify the harmonisation method for a
+        given timeseries
+    """
+    # Group by every index level and hand each group to ``_harmonise_single``
+    # (which asserts that it receives exactly one row). groupby is used to
+    # maintain indexes; not sure if there's a better way because this will
+    # likely be super slow.
+    res = scenarios.groupby(scenarios.index.names).apply(
+        _harmonise_single, history, harmonisation_year, overrides
+    )
+
+    return res
+
+
+def _harmonise_single(timeseries, history, harmonisation_year, overrides):
+    assert timeseries.shape[0] == 1
+    # unclear why we don't use pyam or scmdata for filtering
+    mdata = {
+        k: v for k, v in zip(timeseries.index.names, timeseries.index.to_list()[0])
+    }
+
+    variable = mdata["variable"]
+    region = mdata["region"]
+
+    hist_variable = history.index.get_level_values("variable") == variable
+    hist_region = history.index.get_level_values("region") == region
+    relevant_hist = history[hist_variable & hist_region]
+
+    if relevant_hist.empty:
+        error_msg = "No historical data for `{}` `{}`".format(region, variable)
+        raise MissingHistoricalError(error_msg)
+
+    if harmonisation_year not in relevant_hist:
+        error_msg = "No historical data for year {} for `{}` `{}`".format(
+            harmonisation_year, region, variable
+        )
+        raise MissingHarmonisationYear(error_msg)
+
+    if relevant_hist[harmonisation_year].isnull().all():
+        error_msg = "Historical data is null for year {} for `{}` `{}`".format(
+            harmonisation_year, region, variable
+        )
+        raise MissingHarmonisationYear(error_msg)
+
+    # convert units
+    hist_unit = relevant_hist.index.get_level_values("unit").unique()[0]
+    relevant_hist = _convert_units(
+        relevant_hist, current_unit=hist_unit, target_unit=mdata["unit"]
+    )
+    # set index for rest of processing (as units are now consistent)
+    relevant_hist.index = timeseries.index.copy()
+
+    if overrides is not None:
+        method = overrides.copy()
+        for key, value in mdata.items():
+            if key in method:
+                method = method[(method[key] == value) | method[key].isnull()]
+
+    if overrides is not None and method.shape[0] > 1:
+        error_msg = (
+            "Ambiguous harmonisation overrides for metdata `{}`, the "
+            "following methods match: {}".format(mdata, method)
+        )
+        raise AmbiguousHarmonisationMethod(
+            "More than one override for metadata: {}".format(mdata)
+        )
+
+    if overrides is None or method.empty:
+        default, _ = default_methods(
+            relevant_hist, timeseries, base_year=harmonisation_year
+        )
+        method_to_use = default.values[0]
+
+    else:
+        method_to_use = method["method"].values[0]
+
+    return _harmonise_aligned(
+        timeseries, relevant_hist, harmonisation_year, method_to_use
+    )
+
+
+def _convert_units(inp, current_unit, target_unit):
+    # would be simpler using scmdata or pyam
+    out = inp.copy()
+    out.iloc[:, :] = (
+        (out.values * unit_registry(current_unit)).to(target_unit).magnitude
+    )
+    out = out.reset_index("unit")
+    out["unit"] = target_unit
+    out = out.set_index("unit", append=True)
+
+    return out
+
+
+def _harmonise_aligned(timeseries, history, harmonisation_year, method):
+    # seems odd that the methods are stored in a class instance
+    harmonise_func = Harmonizer._methods[method]
+    delta = _get_delta(timeseries, history, method, harmonisation_year)
+
+    return harmonise_func(timeseries, delta, harmonize_year=harmonisation_year)
+
+
+def _get_delta(timeseries, history, method, harmonisation_year):
+    if method == "budget":
+        return history
+
+    offset, ratio = harmonize_factors(timeseries, history, harmonisation_year)
+    if "ratio" in method:
+        return ratio
+
+    return offset
diff --git a/aneris/errors.py b/aneris/errors.py
@@ -0,0 +1,16 @@
+class AmbiguousHarmonisationMethod(ValueError):
+    """
+    Error raised when harmonisation methods are ambiguous
+
+    For example, when more than one override matches a single timeseries.
+    """
+
+
+class MissingHistoricalError(ValueError):
+    """
+    Error raised when historical data is missing
+
+    For example, when no historical timeseries matches the variable and
+    region of the timeseries being harmonised.
+    """
+
+
+class MissingHarmonisationYear(ValueError):
+    """
+    Error raised when the harmonisation year is missing
+
+    For example, when the historical data has no column for the
+    harmonisation year, or its values in that year are null.
+    """
diff --git a/aneris/methods.py b/aneris/methods.py
@@ -139,9 +139,10 @@ def reduce_offset(df, offset, final_year='2050', harmonize_year='2015'):
     df = df.copy()
     yi, yf = int(harmonize_year), int(final_year)
     numcols = utils.numcols(df)
+    numcols_int = [int(v) for v in numcols]
     # get factors that reduce from 1 to 0; factors before base year are > 1
     f = lambda year: -(year - yi) / float(yf - yi) + 1
-    factors = [f(int(year)) if year <= final_year else 0.0 for year in numcols]
+    factors = [f(year) if year <= yf else 0.0 for year in numcols_int]
     # add existing values to offset time series
     offsets = pd.DataFrame(np.outer(offset, factors),
                            columns=numcols, index=offset.index)
@@ -171,17 +172,19 @@ def reduce_ratio(df, ratios, final_year='2050', harmonize_year='2015'):
     df = df.copy()
     yi, yf = int(harmonize_year), int(final_year)
     numcols = utils.numcols(df)
+    numcols_int = [int(v) for v in numcols]
     # get factors that reduce from 1 to 0, but replace with 1s in years prior
     # to harmonization
     f = lambda year: -(year - yi) / float(yf - yi) + 1
-    prefactors = [f(int(harmonize_year))
-                  for year in numcols if year < harmonize_year]
-    postfactors = [f(int(year)) if year <= final_year else 0.0
-                   for year in numcols if year >= harmonize_year]
+    prefactors = [f(yi)
+                  for year in numcols_int if year < yi]
+    postfactors = [f(year) if year <= yf else 0.0
+                   for year in numcols_int if year >= yi]
     factors = prefactors + postfactors
     # multiply existing values by ratio time series
     ratios = pd.DataFrame(np.outer(ratios - 1, factors),
                           columns=numcols, index=ratios.index) + 1
+
     df[numcols] = df[numcols] * ratios
     return df
 
@@ -402,7 +405,9 @@ def default_method_choice(
             return 'constant_offset'
     else:
         # is this co2?
-        if row.gas == 'CO2':
+        # ZN: This gas dependence isn't documented in the default
+        # decision tree
+        if hasattr(row, "gas") and row.gas == 'CO2':
             return ratio_method
         # is cov big?
         if np.isfinite(row['cov']) and row['cov'] > luc_cov_threshold:
@@ -469,8 +474,12 @@ def default_methods(hist, model, base_year, method_choice=None, **kwargs):
         kwargs['luc_cov_threshold'] = 10
 
     y = str(base_year)
-    h = hist[y]
-    m = model[y]
+    try:
+        h = hist[base_year]
+        m = model[base_year]
+    except KeyError:
+        h = hist[y]
+        m = model[y]
     dH = (h - m).abs() / h
     f = h / m
     dM = (model.max(axis=1) - model.min(axis=1)).abs() / model.max(axis=1)