From 096c210d995ebf7078430739e8e6baccb3320a23 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Mon, 9 Sep 2024 11:30:51 +0100 Subject: [PATCH 01/50] docs|feat [gravity model]: changed type hints in multitld calibration to reflect outputs. added summaries method to calibrate results. --- src/caf/distribute/gravity_model/core.py | 37 +++++++++++++++++++ .../distribute/gravity_model/multi_area.py | 4 +- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/src/caf/distribute/gravity_model/core.py b/src/caf/distribute/gravity_model/core.py index fca906d..acddcd3 100644 --- a/src/caf/distribute/gravity_model/core.py +++ b/src/caf/distribute/gravity_model/core.py @@ -124,6 +124,25 @@ def plot_distributions(self) -> figure.Figure: ax.legend() return fig + + def summary(self, key:Optional[str|int]=None)->pd.Series: + """summarise the run parameters into a series with name set to key. + + + Outputs the gravity model achieved parameters and the convergence. + Parameters + ---------- + key : str | int + key to indentify the data - used as name of the series + + Returns + ------- + pd.DataFrame + a summary of the calibration + """ + output_params = self.cost_params.copy() + output_params["convergence"]=self.cost_convergence + return pd.Series(output_params, name=key) @dataclasses.dataclass @@ -163,6 +182,24 @@ class GravityModelRunResults(GravityModelResults): cost_params: Optional[dict[str, Any]] = None + def summary(self, key:Optional[str|int]=None)->pd.Series: + """summarise the run parameters into a series with name set to key + + + Outputs the gravity model parameters used to generate the distribution. + Parameters + ---------- + key : str | int + key to indentify the data - used as name of the series + + Returns + ------- + pd.DataFrame + a summary of the calibration + """ + return pd.Series(self.cost_params, name=key) + + class GravityModelBase(abc.ABC): """Base Class for gravity models. diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 4090ed3..5173194 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -251,7 +251,7 @@ def _calibrate( default_retry: bool = True, verbose: int = 0, **kwargs, - ) -> dict[str, GravityModelCalibrateResults]: + ) -> dict[str|int, GravityModelCalibrateResults]: params_len = len(self.dists[0].function_params) ordered_init_params = [] for dist in self.dists: @@ -341,7 +341,7 @@ def calibrate( running_log_path: os.PathLike, *args, **kwargs, - ) -> GravityModelCalibrateResults: + ) -> dict[str|int, GravityModelCalibrateResults]: """Find the optimal parameters for self.cost_function. Optimal parameters are found using `scipy.optimize.least_squares` From 559c925f0c97484d8f4dbbdd61566750ca8f8f88 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Mon, 9 Sep 2024 12:06:51 +0100 Subject: [PATCH 02/50] fixes: changed summary to a property and added an abrast base property --- src/caf/distribute/gravity_model/core.py | 29 ++++++++++++------------ 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/caf/distribute/gravity_model/core.py b/src/caf/distribute/gravity_model/core.py index acddcd3..a6ac68a 100644 --- a/src/caf/distribute/gravity_model/core.py +++ b/src/caf/distribute/gravity_model/core.py @@ -56,6 +56,11 @@ class GravityModelResults: cost_distribution: cost_utils.CostDistribution cost_convergence: float value_distribution: np.ndarray + + @property + @abc.abstractmethod + def summary(self): + return @dataclasses.dataclass @@ -125,15 +130,12 @@ def plot_distributions(self) -> figure.Figure: return fig - def summary(self, key:Optional[str|int]=None)->pd.Series: - """summarise the run parameters into a series with name set to key. + @property + def summary(self)->pd.Series: + """summary of the GM calibration parameters as a series. Outputs the gravity model achieved parameters and the convergence. - Parameters - ---------- - key : str | int - key to indentify the data - used as name of the series Returns ------- @@ -142,7 +144,7 @@ def summary(self, key:Optional[str|int]=None)->pd.Series: """ output_params = self.cost_params.copy() output_params["convergence"]=self.cost_convergence - return pd.Series(output_params, name=key) + return pd.Series(output_params) @dataclasses.dataclass @@ -181,23 +183,20 @@ class GravityModelRunResults(GravityModelResults): cost_function: Optional[cost_functions.CostFunction] = None cost_params: Optional[dict[str, Any]] = None - - def summary(self, key:Optional[str|int]=None)->pd.Series: - """summarise the run parameters into a series with name set to key + @property + def summary(self)->pd.Series: + """summary of the GM run parameters as a series. Outputs the gravity model parameters used to generate the distribution. Parameters - ---------- - key : str | int - key to indentify the data - used as name of the series Returns ------- pd.DataFrame - a summary of the calibration + a summary of the run """ - return pd.Series(self.cost_params, name=key) + return pd.Series(self.cost_params) class GravityModelBase(abc.ABC): From 61ac7cf920da8cf64bb173b0fe81b20db2c82724 Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Mon, 9 Sep 2024 17:28:40 +0100 Subject: [PATCH 03/50] api changes: IN PROGRESS changed mult area gm calibrator init vars - temp fix to test GM --- .gitignore | 3 ++ src/caf/distribute/gravity_model/core.py | 4 -- .../distribute/gravity_model/multi_area.py | 52 +++++++++---------- 3 files changed, 29 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 9d37853..db26a51 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,6 @@ dmypy.json # Pyre type checker .pyre/ + + +settings.json diff --git a/src/caf/distribute/gravity_model/core.py b/src/caf/distribute/gravity_model/core.py index a6ac68a..0db28e2 100644 --- a/src/caf/distribute/gravity_model/core.py +++ b/src/caf/distribute/gravity_model/core.py @@ -57,10 +57,6 @@ class GravityModelResults: cost_convergence: float value_distribution: np.ndarray - @property - @abc.abstractmethod - def summary(self): - return @dataclasses.dataclass diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 5173194..340154f 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- """Implementation of a self-calibrating single area gravity model.""" # Built-Ins +from __future__ import annotations + import functools import logging import os @@ -89,6 +91,14 @@ class MultiDistInput(BaseConfig): furness_tolerance: float = 1e-6 furness_jac: float = False +@dataclass +class MultiDistData: + distributions: list[MultiCostDistribution] + init_params: dict[str, float] + log_path: Path + furness_tolerance: float = 1e-6 + furness_jac: float = False + @dataclass class MultiCostDistribution: @@ -118,6 +128,10 @@ class MultiCostDistribution: zones: np.ndarray function_params: dict[str, float] +#TODO fix inputs + +#TODO wrapper + class MultiAreaGravityModelCalibrator(core.GravityModelBase): """ @@ -156,7 +170,12 @@ def __init__( col_targets: np.ndarray, cost_matrix: np.ndarray, cost_function: cost_functions.CostFunction, - params: Optional[MultiDistInput], + #TODO move these parameters as inputs of calibrate and run + distributions: list[MultiCostDistribution], + log_path: Path, + furness_tolerance: float = 1e-6, + furness_jac: float = False, + ): super().__init__(cost_function=cost_function, cost_matrix=cost_matrix) self.row_targets = row_targets @@ -165,30 +184,11 @@ def __init__( raise IndexError("row_targets doesn't match cost_matrix") if len(col_targets) != cost_matrix.shape[1]: raise IndexError("col_targets doesn't match cost_matrix") - if params is not None: - self.tlds = pd.read_csv(params.tld_file) - self.tlds.rename( - columns={ - params.cat_col: "cat", - params.min_col: "min", - params.max_col: "max", - params.ave_col: "avg", - params.trips_col: "trips", - }, - inplace=True, - ) - self.lookup = pd.read_csv(params.tld_lookup_file) - self.lookup.rename( - columns={params.lookup_zone_col: "zone", params.lookup_cat_col: "cat"}, - inplace=True, - ) - self.tlds.set_index("cat", inplace=True) - self.lookup.sort_values("zone") - self.init_params = params.init_params - self.dists = self.process_tlds() - self.log_path = params.log_path - self.furness_tol = params.furness_tolerance - self.furness_jac = params.furness_jac + + self.dists = distributions + self.log_path = log_path + self.furness_tol = furness_tolerance + self.furness_jac = furness_jac def process_tlds(self): """Get distributions in the right format for a multi-area gravity model.""" @@ -563,7 +563,7 @@ def _gravity_function( return achieved_residuals # pylint:enable=too-many-locals - def run(self): + def run(self): """ Run the gravity_model without calibrating. From 50cc9b1b8331cc12dafe0da8ff32f9728b199fa9 Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Tue, 10 Sep 2024 09:37:00 +0100 Subject: [PATCH 04/50] api changes --- .../distribute/gravity_model/multi_area.py | 50 +++++++++---------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 340154f..e601a99 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -19,7 +19,7 @@ # Local Imports from caf.distribute import cost_functions, furness from caf.distribute.gravity_model import core -from caf.distribute.gravity_model.core import GravityModelCalibrateResults +from caf.distribute.gravity_model.core import GravityModelCalibrateResults, GravityModelRunResults # # # CONSTANTS # # # LOG = logging.getLogger(__name__) @@ -171,11 +171,6 @@ def __init__( cost_matrix: np.ndarray, cost_function: cost_functions.CostFunction, #TODO move these parameters as inputs of calibrate and run - distributions: list[MultiCostDistribution], - log_path: Path, - furness_tolerance: float = 1e-6, - furness_jac: float = False, - ): super().__init__(cost_function=cost_function, cost_matrix=cost_matrix) self.row_targets = row_targets @@ -185,10 +180,6 @@ def __init__( if len(col_targets) != cost_matrix.shape[1]: raise IndexError("col_targets doesn't match cost_matrix") - self.dists = distributions - self.log_path = log_path - self.furness_tol = furness_tolerance - self.furness_jac = furness_jac def process_tlds(self): """Get distributions in the right format for a multi-area gravity model.""" @@ -243,6 +234,9 @@ def _create_seed_matrix(self, cost_distributions, cost_args, params_len): # pylint: disable=too-many-locals def _calibrate( self, + distributions: list[MultiCostDistribution], + running_log_path:Path, + furness_jac: bool = False, diff_step: float = 1e-8, ftol: float = 1e-4, xtol: float = 1e-4, @@ -252,26 +246,27 @@ def _calibrate( verbose: int = 0, **kwargs, ) -> dict[str|int, GravityModelCalibrateResults]: - params_len = len(self.dists[0].function_params) + params_len = len(distributions[0].function_params) ordered_init_params = [] - for dist in self.dists: + for dist in distributions: params = self._order_cost_params(dist.function_params) for val in params: ordered_init_params.append(val) gravity_kwargs: dict[str, Any] = { - "running_log_path": self.log_path, - "cost_distributions": self.dists, + "running_log_path": running_log_path, + "cost_distributions": distributions, "diff_step": diff_step, "params_len": params_len, + "furness_jac": furness_jac } optimise_cost_params = functools.partial( optimize.least_squares, fun=self._gravity_function, method=self._least_squares_method, bounds=( - self._order_bounds()[0] * len(self.dists), - self._order_bounds()[1] * len(self.dists), + self._order_bounds()[0] * len(distributions), + self._order_bounds()[1] * len(distributions), ), jac=self._jacobian_function, verbose=verbose, @@ -321,7 +316,7 @@ def _calibrate( assert self.achieved_cost_dist is not None results = {} - for i, dist in enumerate(self.dists): + for i, dist in enumerate(distributions): result_i = GravityModelCalibrateResults( cost_distribution=self.achieved_cost_dist[i], cost_convergence=self.achieved_convergence[dist.name], @@ -338,6 +333,7 @@ def _calibrate( def calibrate( self, + distributions: list[MultiCostDistribution], running_log_path: os.PathLike, *args, **kwargs, @@ -427,13 +423,14 @@ def calibrate( `caf.distribute.furness.doubly_constrained_furness()` `scipy.optimize.least_squares()` """ - for dist in self.dists: + for dist in distributions: self.cost_function.validate_params(dist.function_params) self._validate_running_log(running_log_path) self._initialise_internal_params() return self._calibrate( # type: ignore + distributions, + running_log_path, *args, - running_log_path=running_log_path, **kwargs, ) @@ -442,6 +439,7 @@ def _jacobian_function( init_params: list[float], cost_distributions: list[MultiCostDistribution], diff_step: float, + furness_jac: bool, running_log_path, params_len, ): @@ -475,7 +473,7 @@ def _jacobian_function( adj_mat = base_mat.copy() adj_mat[dist.zones] = adj_mat_slice adj_dist = adj_mat * furness_factor - if self.furness_jac: + if furness_jac: adj_dist, *_ = furness.doubly_constrained_furness( seed_vals=adj_dist, row_targets=self.achieved_distribution.sum(axis=1), @@ -563,29 +561,29 @@ def _gravity_function( return achieved_residuals # pylint:enable=too-many-locals - def run(self): + def run(self, distributions: list[MultiCostDistribution], running_log_path:Path)->dict[int|str, GravityModelCalibrateResults]: """ Run the gravity_model without calibrating. This should be done when you have calibrating previously to find the correct parameters for the cost function. """ - params_len = len(self.dists[0].function_params) + params_len = len(distributions[0].function_params) cost_args = [] - for dist in self.dists: + for dist in distributions: for param in dist.function_params.values(): cost_args.append(param) self._gravity_function( init_params=cost_args, - cost_distributions=self.dists, - running_log_path=self.log_path, + cost_distributions=distributions, + running_log_path=running_log_path, params_len=params_len, ) assert self.achieved_cost_dist is not None results = {} - for i, dist in enumerate(self.dists): + for i, dist in enumerate(distributions): result_i = GravityModelCalibrateResults( cost_distribution=self.achieved_cost_dist[i], cost_convergence=self.achieved_convergence[dist.name], From a8be59d4115e2e4959925590aa0ddf156cc3246d Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Tue, 10 Sep 2024 10:19:07 +0100 Subject: [PATCH 05/50] bug fix wityh undefined class attr --- .../distribute/gravity_model/multi_area.py | 47 ++++++++++++------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index e601a99..470d706 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -19,7 +19,10 @@ # Local Imports from caf.distribute import cost_functions, furness from caf.distribute.gravity_model import core -from caf.distribute.gravity_model.core import GravityModelCalibrateResults, GravityModelRunResults +from caf.distribute.gravity_model.core import ( + GravityModelCalibrateResults, + GravityModelRunResults, +) # # # CONSTANTS # # # LOG = logging.getLogger(__name__) @@ -91,6 +94,7 @@ class MultiDistInput(BaseConfig): furness_tolerance: float = 1e-6 furness_jac: float = False + @dataclass class MultiDistData: distributions: list[MultiCostDistribution] @@ -128,10 +132,6 @@ class MultiCostDistribution: zones: np.ndarray function_params: dict[str, float] -#TODO fix inputs - -#TODO wrapper - class MultiAreaGravityModelCalibrator(core.GravityModelBase): """ @@ -170,7 +170,7 @@ def __init__( col_targets: np.ndarray, cost_matrix: np.ndarray, cost_function: cost_functions.CostFunction, - #TODO move these parameters as inputs of calibrate and run + # TODO move these parameters as inputs of calibrate and run ): super().__init__(cost_function=cost_function, cost_matrix=cost_matrix) self.row_targets = row_targets @@ -180,7 +180,6 @@ def __init__( if len(col_targets) != cost_matrix.shape[1]: raise IndexError("col_targets doesn't match cost_matrix") - def process_tlds(self): """Get distributions in the right format for a multi-area gravity model.""" dists = [] @@ -234,18 +233,19 @@ def _create_seed_matrix(self, cost_distributions, cost_args, params_len): # pylint: disable=too-many-locals def _calibrate( self, - distributions: list[MultiCostDistribution], - running_log_path:Path, + distributions: list[MultiCostDistribution], + running_log_path: Path, furness_jac: bool = False, diff_step: float = 1e-8, ftol: float = 1e-4, xtol: float = 1e-4, + furness_tol=1e-6, grav_max_iters: int = 100, failure_tol: float = 0, default_retry: bool = True, verbose: int = 0, **kwargs, - ) -> dict[str|int, GravityModelCalibrateResults]: + ) -> dict[str | int, GravityModelCalibrateResults]: params_len = len(distributions[0].function_params) ordered_init_params = [] for dist in distributions: @@ -258,7 +258,8 @@ def _calibrate( "cost_distributions": distributions, "diff_step": diff_step, "params_len": params_len, - "furness_jac": furness_jac + "furness_jac": furness_jac, + "furness_tol": furness_tol, } optimise_cost_params = functools.partial( optimize.least_squares, @@ -333,11 +334,11 @@ def _calibrate( def calibrate( self, - distributions: list[MultiCostDistribution], + distributions: list[MultiCostDistribution], running_log_path: os.PathLike, *args, **kwargs, - ) -> dict[str|int, GravityModelCalibrateResults]: + ) -> dict[str | int, GravityModelCalibrateResults]: """Find the optimal parameters for self.cost_function. Optimal parameters are found using `scipy.optimize.least_squares` @@ -428,7 +429,7 @@ def calibrate( self._validate_running_log(running_log_path) self._initialise_internal_params() return self._calibrate( # type: ignore - distributions, + distributions, running_log_path, *args, **kwargs, @@ -438,6 +439,7 @@ def _jacobian_function( self, init_params: list[float], cost_distributions: list[MultiCostDistribution], + furness_tol:int, diff_step: float, furness_jac: bool, running_log_path, @@ -478,7 +480,7 @@ def _jacobian_function( seed_vals=adj_dist, row_targets=self.achieved_distribution.sum(axis=1), col_targets=self.achieved_distribution.sum(axis=0), - tol=self.furness_tol / 10, + tol=furness_tol / 10, max_iters=20, warning=False, ) @@ -502,7 +504,14 @@ def _jacobian_function( return jacobian def _gravity_function( - self, init_params, cost_distributions, running_log_path, params_len, diff_step=0 + self, + init_params, + cost_distributions, + furness_tol, + running_log_path, + params_len, + diff_step=0, + **_, ): del diff_step @@ -511,7 +520,7 @@ def _gravity_function( seed_vals=base_mat, row_targets=self.row_targets, col_targets=self.col_targets, - tol=self.furness_tol, + tol=furness_tol, ) convergences, distributions, residuals = {}, [], [] for dist in cost_distributions: @@ -561,7 +570,9 @@ def _gravity_function( return achieved_residuals # pylint:enable=too-many-locals - def run(self, distributions: list[MultiCostDistribution], running_log_path:Path)->dict[int|str, GravityModelCalibrateResults]: + def run( + self, distributions: list[MultiCostDistribution], running_log_path: Path + ) -> dict[int | str, GravityModelCalibrateResults]: """ Run the gravity_model without calibrating. From 519ce1f4d9864cbe4ee8edae580dc9953cceaf73 Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Wed, 11 Sep 2024 14:50:40 +0100 Subject: [PATCH 06/50] Tidied up and added from pandas function --- .../distribute/gravity_model/multi_area.py | 140 ++++++++---------- 1 file changed, 58 insertions(+), 82 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 470d706..68178ef 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -94,16 +94,7 @@ class MultiDistInput(BaseConfig): furness_tolerance: float = 1e-6 furness_jac: float = False - -@dataclass -class MultiDistData: - distributions: list[MultiCostDistribution] - init_params: dict[str, float] - log_path: Path - furness_tolerance: float = 1e-6 - furness_jac: float = False - - + @dataclass class MultiCostDistribution: """ @@ -127,11 +118,59 @@ class MultiCostDistribution: implemented. """ + #cost_distribution: dict[id, cost_utils.CostDistribution] + #matrix_id_lookup: np.ndarray + #function_params: dict[id, dict[str,float]] + name: str cost_distribution: cost_utils.CostDistribution zones: np.ndarray function_params: dict[str, float] + @classmethod + def from_pandas( + cls, + category: str|int, + ordered_zones: pd.Series, + tld: pd.DataFrame, + cat_zone_correspondence: pd.DataFrame, + func_params: dict[str, float], + tld_cat_col: str = "category", + tld_min_col: str = "from", + tld_max_col: str = "to", + tld_avg_col: str = "av_distance", + tld_trips_col: str = "trips", + lookup_cat_col: str = "category", + lookup_zone_col: str = "zone_id", + )->MultiCostDistribution: + #get a list of zones that use this category of TLD + cat_zones = cat_zone_correspondence.loc[ + cat_zone_correspondence[lookup_cat_col] == category, lookup_zone_col + ].to_numpy() + + zones = ordered_zones.to_numpy() + + #tell user if we have zones in cat->lookup that arent in zones + if not np.all(np.isin(cat_zones, zones)): + missing_values = cat_zones[~np.isin(cat_zones, zones)] + raise ValueError( + f"The following values from cat->zone lookup are not present in the tld zones: {missing_values}" + ) + + # get the indices + cat_zone_indices = np.where(np.isin(cat_zones, zones)) + + # get tld for cat + cat_tld = tld[tld[tld_cat_col] == category] + + cat_cost_distribution = cost_utils.CostDistribution( + cat_tld, tld_min_col, tld_max_col, tld_avg_col, tld_trips_col, tld_avg_col + ) + + return cls( + category, cat_cost_distribution, cat_zone_indices, func_params + ) + class MultiAreaGravityModelCalibrator(core.GravityModelBase): """ @@ -158,10 +197,6 @@ class MultiAreaGravityModelCalibrator(core.GravityModelBase): The cost function to use when calibrating the gravity model. This function is applied to `cost_matrix` before Furnessing during calibration. - - params: Optional[MultiDistInput] - Info needed for a multi-distribution gravity model. See documentation - for MultiDistInput. """ def __init__( @@ -344,78 +379,19 @@ def calibrate( Optimal parameters are found using `scipy.optimize.least_squares` to fit the distributed row/col targets to `target_cost_distribution`. + NOTE: The achieved distribution is found by accessing self.achieved + distribution of the object this method is called on. The output of + this method shows the distribution and results for each individual TLD. + Parameters ---------- - init_params: - A dictionary of {parameter_name: parameter_value} to pass - into the cost function as initial parameters. - - running_log_path: - Path to output the running log to. This log will detail the - performance of the run and is written in .csv format. - - target_cost_distribution: - The cost distribution to calibrate towards during the calibration - process. - - diff_step: - Copied from scipy.optimize.least_squares documentation, where it - is passed to: - Determines the relative step size for the finite difference - approximation of the Jacobian. The actual step is computed as - `x * diff_step`. If None (default), then diff_step is taken to be a - conventional “optimal” power of machine epsilon for the finite - difference scheme used - - ftol: - The tolerance to pass to `scipy.optimize.least_squares`. The search - will stop once this tolerance has been met. This is the - tolerance for termination by the change of the cost function - - xtol: - The tolerance to pass to `scipy.optimize.least_squares`. The search - will stop once this tolerance has been met. This is the - tolerance for termination by the change of the independent - variables. - - grav_max_iters: - The maximum number of calibration iterations to complete before - termination if the ftol has not been met. - - failure_tol: - If, after initial calibration using `init_params`, the achieved - convergence is less than this value, calibration will be run again with - the default parameters from `self.cost_function`. - - default_retry: - If, after running with `init_params`, the achieved convergence - is less than `failure_tol`, calibration will be run again with the - default parameters of `self.cost_function`. - This argument is ignored if the default parameters are given - as `init_params. - - n_random_tries: - If, after running with default parameters of `self.cost_function`, - the achieved convergence is less than `failure_tol`, calibration will - be run again using random values for the cost parameters this - number of times. - - verbose: - Copied from scipy.optimize.least_squares documentation, where it - is passed to: - Level of algorithm’s verbosity: - - 0 (default) : work silently. - - 1 : display a termination report. - - 2 : display progress during iterations (not supported by ‘lm’ method). - - kwargs: - Additional arguments passed to self.gravity_furness. - Empty by default. The calling signature is: - `self.gravity_furness(seed_matrix, **kwargs)` - + distributions: list[MultiCostDistribution], + running_log_path: os.PathLike, + *args, + **kwargs, Returns ------- - results: + dict[str | int, GravityModelCalibrateResults]: An instance of GravityModelCalibrateResults containing the results of this run. From 05fba6007f225e0f02193947aa5165ccbc8d861c Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Thu, 12 Sep 2024 10:43:27 +0100 Subject: [PATCH 07/50] fixes: run method fixes --- src/caf/distribute/gravity_model/multi_area.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 68178ef..77a1a12 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -26,7 +26,7 @@ # # # CONSTANTS # # # LOG = logging.getLogger(__name__) - +DEFAULT_FURNESS_TOL= 1e-6 # pylint:disable=duplicate-code # Furness called with same parameters in single and multi-area @@ -274,7 +274,7 @@ def _calibrate( diff_step: float = 1e-8, ftol: float = 1e-4, xtol: float = 1e-4, - furness_tol=1e-6, + furness_tol=DEFAULT_FURNESS_TOL, grav_max_iters: int = 100, failure_tol: float = 0, default_retry: bool = True, @@ -547,7 +547,7 @@ def _gravity_function( # pylint:enable=too-many-locals def run( - self, distributions: list[MultiCostDistribution], running_log_path: Path + self, distributions: list[MultiCostDistribution], running_log_path: Path, furness_tol=DEFAULT_FURNESS_TOL ) -> dict[int | str, GravityModelCalibrateResults]: """ Run the gravity_model without calibrating. @@ -566,6 +566,7 @@ def run( cost_distributions=distributions, running_log_path=running_log_path, params_len=params_len, + furness_tol=furness_tol ) assert self.achieved_cost_dist is not None From b52b91ff1974349ab2fc8f465c538f82873be895 Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Fri, 13 Sep 2024 16:47:54 +0100 Subject: [PATCH 08/50] ran black --- src/caf/distribute/gravity_model/core.py | 15 ++++--- .../distribute/gravity_model/multi_area.py | 40 ++++++++++--------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/src/caf/distribute/gravity_model/core.py b/src/caf/distribute/gravity_model/core.py index 0db28e2..8fba13b 100644 --- a/src/caf/distribute/gravity_model/core.py +++ b/src/caf/distribute/gravity_model/core.py @@ -56,7 +56,6 @@ class GravityModelResults: cost_distribution: cost_utils.CostDistribution cost_convergence: float value_distribution: np.ndarray - @dataclasses.dataclass @@ -125,9 +124,9 @@ def plot_distributions(self) -> figure.Figure: ax.legend() return fig - + @property - def summary(self)->pd.Series: + def summary(self) -> pd.Series: """summary of the GM calibration parameters as a series. @@ -137,9 +136,9 @@ def summary(self)->pd.Series: ------- pd.DataFrame a summary of the calibration - """ + """ output_params = self.cost_params.copy() - output_params["convergence"]=self.cost_convergence + output_params["convergence"] = self.cost_convergence return pd.Series(output_params) @@ -180,10 +179,10 @@ class GravityModelRunResults(GravityModelResults): cost_params: Optional[dict[str, Any]] = None @property - def summary(self)->pd.Series: + def summary(self) -> pd.Series: """summary of the GM run parameters as a series. - + Outputs the gravity model parameters used to generate the distribution. Parameters @@ -191,7 +190,7 @@ def summary(self)->pd.Series: ------- pd.DataFrame a summary of the run - """ + """ return pd.Series(self.cost_params) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 77a1a12..776784a 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -26,7 +26,8 @@ # # # CONSTANTS # # # LOG = logging.getLogger(__name__) -DEFAULT_FURNESS_TOL= 1e-6 +DEFAULT_FURNESS_TOL = 1e-6 + # pylint:disable=duplicate-code # Furness called with same parameters in single and multi-area @@ -94,7 +95,7 @@ class MultiDistInput(BaseConfig): furness_tolerance: float = 1e-6 furness_jac: float = False - + @dataclass class MultiCostDistribution: """ @@ -118,9 +119,9 @@ class MultiCostDistribution: implemented. """ - #cost_distribution: dict[id, cost_utils.CostDistribution] - #matrix_id_lookup: np.ndarray - #function_params: dict[id, dict[str,float]] + # cost_distribution: dict[id, cost_utils.CostDistribution] + # matrix_id_lookup: np.ndarray + # function_params: dict[id, dict[str,float]] name: str cost_distribution: cost_utils.CostDistribution @@ -130,8 +131,8 @@ class MultiCostDistribution: @classmethod def from_pandas( cls, - category: str|int, - ordered_zones: pd.Series, + category: str | int, + ordered_zones: pd.Series, tld: pd.DataFrame, cat_zone_correspondence: pd.DataFrame, func_params: dict[str, float], @@ -142,15 +143,15 @@ def from_pandas( tld_trips_col: str = "trips", lookup_cat_col: str = "category", lookup_zone_col: str = "zone_id", - )->MultiCostDistribution: - #get a list of zones that use this category of TLD + ) -> MultiCostDistribution: + # get a list of zones that use this category of TLD cat_zones = cat_zone_correspondence.loc[ cat_zone_correspondence[lookup_cat_col] == category, lookup_zone_col ].to_numpy() zones = ordered_zones.to_numpy() - - #tell user if we have zones in cat->lookup that arent in zones + + # tell user if we have zones in cat->lookup that arent in zones if not np.all(np.isin(cat_zones, zones)): missing_values = cat_zones[~np.isin(cat_zones, zones)] raise ValueError( @@ -167,9 +168,7 @@ def from_pandas( cat_tld, tld_min_col, tld_max_col, tld_avg_col, tld_trips_col, tld_avg_col ) - return cls( - category, cat_cost_distribution, cat_zone_indices, func_params - ) + return cls(category, cat_cost_distribution, cat_zone_indices, func_params) class MultiAreaGravityModelCalibrator(core.GravityModelBase): @@ -379,8 +378,8 @@ def calibrate( Optimal parameters are found using `scipy.optimize.least_squares` to fit the distributed row/col targets to `target_cost_distribution`. - NOTE: The achieved distribution is found by accessing self.achieved - distribution of the object this method is called on. The output of + NOTE: The achieved distribution is found by accessing self.achieved + distribution of the object this method is called on. The output of this method shows the distribution and results for each individual TLD. Parameters @@ -415,7 +414,7 @@ def _jacobian_function( self, init_params: list[float], cost_distributions: list[MultiCostDistribution], - furness_tol:int, + furness_tol: int, diff_step: float, furness_jac: bool, running_log_path, @@ -547,7 +546,10 @@ def _gravity_function( # pylint:enable=too-many-locals def run( - self, distributions: list[MultiCostDistribution], running_log_path: Path, furness_tol=DEFAULT_FURNESS_TOL + self, + distributions: list[MultiCostDistribution], + running_log_path: Path, + furness_tol=DEFAULT_FURNESS_TOL, ) -> dict[int | str, GravityModelCalibrateResults]: """ Run the gravity_model without calibrating. @@ -566,7 +568,7 @@ def run( cost_distributions=distributions, running_log_path=running_log_path, params_len=params_len, - furness_tol=furness_tol + furness_tol=furness_tol, ) assert self.achieved_cost_dist is not None From c6719895926854e66a97074f47eaaae9f4146010 Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Tue, 24 Sep 2024 09:18:25 +0100 Subject: [PATCH 09/50] comment --- src/caf/distribute/gravity_model/multi_area.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 776784a..41a43c5 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -128,6 +128,8 @@ class MultiCostDistribution: zones: np.ndarray function_params: dict[str, float] + #TODO validate params + @classmethod def from_pandas( cls, From e0c7fd65ab068743f78709b01ba344ce70bd75d4 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 24 Sep 2024 12:22:56 +0100 Subject: [PATCH 10/50] added some validation --- .../distribute/gravity_model/multi_area.py | 49 ++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 41a43c5..ddb00d4 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -128,7 +128,7 @@ class MultiCostDistribution: zones: np.ndarray function_params: dict[str, float] - #TODO validate params + # TODO validate params @classmethod def from_pandas( @@ -209,6 +209,28 @@ def __init__( # TODO move these parameters as inputs of calibrate and run ): super().__init__(cost_function=cost_function, cost_matrix=cost_matrix) + + if row_targets.sum() != col_targets.sum(): + LOG.info( + "row and column target totals do not match. This is likely to cause Furnessing to fail." + ) + + checks = { + "row targets": row_targets, + "column targets": col_targets, + "cost matrix": cost_matrix, + } + + for name, data in checks.items(): + if np.isnan(data).any(): + LOG.warning(f"There are NaNs in {name}, this may cause a problem.") + if np.isinf(data).any(): + LOG.warning(f"There are Infs in {name}, this may cause a problem.") + + num_zeros = (data == 0).sum() # casting bool as 1,0 + + LOG.info(f"There are {num_zeros} 0s in {name}") + self.row_targets = row_targets self.col_targets = col_targets if len(row_targets) != cost_matrix.shape[0]: @@ -289,6 +311,31 @@ def _calibrate( for val in params: ordered_init_params.append(val) + max_binning = dist.cost_distribution.max_vals.max() + min_binning = dist.cost_distribution.min_vals.min() + + max_cost = self.cost_matrix[dist.zones, :].max() + min_cost = self.cost_matrix[dist.zones, :].min() + + if max_cost > max_binning: + LOG.warning( + f"the maximum cost in the cost matrix for" + f" category {dist.name}, was {max_cost}, " + "whereas the highest bin edge in cost" + f" distribution was {max_binning}, " + "you will not be fitting to trips" + " with a cost greater than the binning" + ) + if min_cost > min_binning: + LOG.warning( + f"the min cost in the cost matrix for" + f" category {dist.name}, was {min_cost}, " + "whereas the lowest bin edge in cost" + f" distribution was {min_binning}, " + "you will not be fitting to trips" + " with a cost less than the binning" + ) + gravity_kwargs: dict[str, Any] = { "running_log_path": running_log_path, "cost_distributions": distributions, From aad80885f11e3f88b91b6de5c9c6ca31ec02d9a9 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 24 Sep 2024 13:43:33 +0100 Subject: [PATCH 11/50] added extra check --- src/caf/distribute/gravity_model/multi_area.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index ddb00d4..475a702 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -216,9 +216,9 @@ def __init__( ) checks = { + "cost matrix": cost_matrix, "row targets": row_targets, "column targets": col_targets, - "cost matrix": cost_matrix, } for name, data in checks.items(): @@ -227,9 +227,16 @@ def __init__( if np.isinf(data).any(): LOG.warning(f"There are Infs in {name}, this may cause a problem.") - num_zeros = (data == 0).sum() # casting bool as 1,0 + num_zeros = (data == 0).sum() # casting bool as 1, 0 + + LOG.info(f"There are {num_zeros} 0s in {name} ({(num_zeros/data.size)*100} %)") + + + + zero_in_both = np.stack(row_targets==0, col_targets==0, axis=1).all(axis=1).sum() - LOG.info(f"There are {num_zeros} 0s in {name}") + LOG.info(f"There are {zero_in_both} zones with both 0 row and column targets.") + self.row_targets = row_targets self.col_targets = col_targets From 10a30706a5bc03f1dea3b231ad92f7292584dd09 Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Wed, 25 Sep 2024 13:22:32 +0100 Subject: [PATCH 12/50] minor changes to validation, changed tanner param limits and start values --- src/caf/distribute/cost_functions.py | 4 ++-- src/caf/distribute/gravity_model/multi_area.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/caf/distribute/cost_functions.py b/src/caf/distribute/cost_functions.py index f296d73..1e92c07 100644 --- a/src/caf/distribute/cost_functions.py +++ b/src/caf/distribute/cost_functions.py @@ -34,8 +34,8 @@ def get_cost_function(self) -> CostFunction: if self == BuiltInCostFunction.TANNER: return CostFunction( name=self.name, - params={"alpha": (-5, 5), "beta": (-5, 5)}, - default_params={"alpha": 1, "beta": 1}, + params={"alpha": (-1, 1), "beta": (-1, 1)}, + default_params={"alpha": 0.1, "beta": -0.1}, function=tanner, ) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 475a702..1e24efd 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -223,9 +223,9 @@ def __init__( for name, data in checks.items(): if np.isnan(data).any(): - LOG.warning(f"There are NaNs in {name}, this may cause a problem.") + raise ValueError(f"There are NaNs in {name}") if np.isinf(data).any(): - LOG.warning(f"There are Infs in {name}, this may cause a problem.") + raise ValueError(f"There are Infs in {name}") num_zeros = (data == 0).sum() # casting bool as 1, 0 @@ -233,7 +233,7 @@ def __init__( - zero_in_both = np.stack(row_targets==0, col_targets==0, axis=1).all(axis=1).sum() + zero_in_both = np.stack([row_targets==0, col_targets==0], axis=1).all(axis=1).sum() LOG.info(f"There are {zero_in_both} zones with both 0 row and column targets.") @@ -333,7 +333,7 @@ def _calibrate( "you will not be fitting to trips" " with a cost greater than the binning" ) - if min_cost > min_binning: + if min_cost < min_binning: LOG.warning( f"the min cost in the cost matrix for" f" category {dist.name}, was {min_cost}, " From 6a7df03c7b72867f32fcb34113ef5f97bde83264 Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Wed, 25 Sep 2024 13:23:04 +0100 Subject: [PATCH 13/50] ran black --- src/caf/distribute/gravity_model/multi_area.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 1e24efd..28bffbb 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -231,12 +231,9 @@ def __init__( LOG.info(f"There are {num_zeros} 0s in {name} ({(num_zeros/data.size)*100} %)") - - - zero_in_both = np.stack([row_targets==0, col_targets==0], axis=1).all(axis=1).sum() + zero_in_both = np.stack([row_targets == 0, col_targets == 0], axis=1).all(axis=1).sum() LOG.info(f"There are {zero_in_both} zones with both 0 row and column targets.") - self.row_targets = row_targets self.col_targets = col_targets From 2a5163e538c9393ead24c65d18b676384ef41cd4 Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Mon, 30 Sep 2024 09:02:37 +0100 Subject: [PATCH 14/50] added a wrapper around the multicost_distribution class --- .../distribute/gravity_model/multi_area.py | 88 ++++++++++++++++++- 1 file changed, 84 insertions(+), 4 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 28bffbb..3dd22ad 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -8,7 +8,7 @@ import os from dataclasses import dataclass from pathlib import Path -from typing import Any, Optional +from typing import Any, Optional, Iterator # Third Party import numpy as np @@ -98,6 +98,85 @@ class MultiDistInput(BaseConfig): @dataclass class MultiCostDistribution: + distributions: list[MGMCostDistribution] + + @classmethod + def from_pandas( + cls, + ordered_zones: pd.Series, + tld: pd.DataFrame, + cat_zone_correspondence: pd.DataFrame, + func_params: dict[str, float], + tld_cat_col: str = "category", + tld_min_col: str = "from", + tld_max_col: str = "to", + tld_avg_col: str = "av_distance", + tld_trips_col: str = "trips", + lookup_cat_col: str = "category", + lookup_zone_col: str = "zone_id", + ): + + distributions: list[MGMCostDistribution] = [] + + for category in cat_zone_correspondence[lookup_cat_col].unique(): + + distributions.append( + MGMCostDistribution.from_pandas( + category, + pd.Series(ordered_zones), + tld, + cat_zone_correspondence, + func_params, + tld_cat_col, + tld_min_col, + tld_max_col, + tld_avg_col, + tld_trips_col, + lookup_cat_col, + lookup_zone_col, + ) + ) + + cls.validate(distributions) + + return cls(distributions) + + @classmethod + def validate(cls, distributions: list[MGMCostDistribution]): + + if len(distributions) == 0: + raise ValueError("no distributions provided") + + all_zones: Optional[np.ndarray] = None + + for dist in distributions: + if all_zones is None: + all_zones = dist.zones + else: + all_zones = np.concatenate((all_zones, dist.zones)) + + assert all_zones is not None + + + if len(np.unique(all_zones))!=len(all_zones): + raise ValueError("duplicate zones found in the distribution zone definition") + + def __iter__(self)-> Iterator[MGMCostDistribution]: + yield from self.distributions + + def __getitem__(self, x)->MGMCostDistribution: + return self.distributions[x] + + def __len__(self)->int: + return len(self.distributions) + + def copy(self)->MultiCostDistribution: + return MultiCostDistribution(self.distributions) + + + +@dataclass +class MGMCostDistribution: """ Dataclass for storing needed info for a MultiCostDistribution model. @@ -161,7 +240,7 @@ def from_pandas( ) # get the indices - cat_zone_indices = np.where(np.isin(cat_zones, zones)) + cat_zone_indices = np.where(np.isin(zones, cat_zones))[0] # get tld for cat cat_tld = tld[tld[tld_cat_col] == category] @@ -295,7 +374,7 @@ def _create_seed_matrix(self, cost_distributions, cost_args, params_len): # pylint: disable=too-many-locals def _calibrate( self, - distributions: list[MultiCostDistribution], + distributions: MultiCostDistribution, running_log_path: Path, furness_jac: bool = False, diff_step: float = 1e-8, @@ -310,6 +389,7 @@ def _calibrate( ) -> dict[str | int, GravityModelCalibrateResults]: params_len = len(distributions[0].function_params) ordered_init_params = [] + #TODO validate cost distributions for dist in distributions: params = self._order_cost_params(dist.function_params) for val in params: @@ -421,7 +501,7 @@ def _calibrate( def calibrate( self, - distributions: list[MultiCostDistribution], + distributions: MultiCostDistribution, running_log_path: os.PathLike, *args, **kwargs, From f065f569ff47c93e81138a3c04bac98250e257df Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Mon, 30 Sep 2024 09:04:11 +0100 Subject: [PATCH 15/50] ran black --- .../distribute/gravity_model/multi_area.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 3dd22ad..cbaa0eb 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -115,8 +115,8 @@ def from_pandas( lookup_cat_col: str = "category", lookup_zone_col: str = "zone_id", ): - - distributions: list[MGMCostDistribution] = [] + + distributions: list[MGMCostDistribution] = [] for category in cat_zone_correspondence[lookup_cat_col].unique(): @@ -140,13 +140,13 @@ def from_pandas( cls.validate(distributions) return cls(distributions) - + @classmethod def validate(cls, distributions: list[MGMCostDistribution]): if len(distributions) == 0: raise ValueError("no distributions provided") - + all_zones: Optional[np.ndarray] = None for dist in distributions: @@ -157,22 +157,20 @@ def validate(cls, distributions: list[MGMCostDistribution]): assert all_zones is not None - - if len(np.unique(all_zones))!=len(all_zones): + if len(np.unique(all_zones)) != len(all_zones): raise ValueError("duplicate zones found in the distribution zone definition") - - def __iter__(self)-> Iterator[MGMCostDistribution]: - yield from self.distributions - - def __getitem__(self, x)->MGMCostDistribution: + + def __iter__(self) -> Iterator[MGMCostDistribution]: + yield from self.distributions + + def __getitem__(self, x) -> MGMCostDistribution: return self.distributions[x] - - def __len__(self)->int: + + def __len__(self) -> int: return len(self.distributions) - - def copy(self)->MultiCostDistribution: - return MultiCostDistribution(self.distributions) + def copy(self) -> MultiCostDistribution: + return MultiCostDistribution(self.distributions) @dataclass @@ -389,7 +387,7 @@ def _calibrate( ) -> dict[str | int, GravityModelCalibrateResults]: params_len = len(distributions[0].function_params) ordered_init_params = [] - #TODO validate cost distributions + # TODO validate cost distributions for dist in distributions: params = self._order_cost_params(dist.function_params) for val in params: From 6580ea6af68704958cf2b4ca95756461d91cbb2a Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Wed, 2 Oct 2024 14:46:47 +0100 Subject: [PATCH 16/50] changed multicostdistribution to allow different values for cost function params for GMcategory --- src/caf/distribute/gravity_model/multi_area.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index cbaa0eb..a8771c1 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -106,7 +106,7 @@ def from_pandas( ordered_zones: pd.Series, tld: pd.DataFrame, cat_zone_correspondence: pd.DataFrame, - func_params: dict[str, float], + func_params: dict[str, dict[str, float]], tld_cat_col: str = "category", tld_min_col: str = "from", tld_max_col: str = "to", @@ -126,7 +126,7 @@ def from_pandas( pd.Series(ordered_zones), tld, cat_zone_correspondence, - func_params, + func_params[category], tld_cat_col, tld_min_col, tld_max_col, @@ -222,7 +222,7 @@ def from_pandas( tld_trips_col: str = "trips", lookup_cat_col: str = "category", lookup_zone_col: str = "zone_id", - ) -> MultiCostDistribution: + ) -> MGMCostDistribution: # get a list of zones that use this category of TLD cat_zones = cat_zone_correspondence.loc[ cat_zone_correspondence[lookup_cat_col] == category, lookup_zone_col From a5b3043fc3c03388aad8c145dce278bf0797d840 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Wed, 2 Oct 2024 14:51:21 +0100 Subject: [PATCH 17/50] ran black, type hint update --- src/caf/distribute/gravity_model/multi_area.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index a8771c1..afbb401 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -106,7 +106,7 @@ def from_pandas( ordered_zones: pd.Series, tld: pd.DataFrame, cat_zone_correspondence: pd.DataFrame, - func_params: dict[str, dict[str, float]], + func_params: dict[int | str, dict[str, float]], tld_cat_col: str = "category", tld_min_col: str = "from", tld_max_col: str = "to", From 55f0b5a4a88965190ae8b29c876348e18a586ed3 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Wed, 2 Oct 2024 14:54:05 +0100 Subject: [PATCH 18/50] doctring minor update --- src/caf/distribute/gravity_model/multi_area.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index afbb401..367a231 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -515,7 +515,7 @@ def calibrate( Parameters ---------- - distributions: list[MultiCostDistribution], + distributions: MultiCostDistribution, running_log_path: os.PathLike, *args, **kwargs, From 05f707cfa5e6fd7cd5c2dd8bca383e52bd2533dd Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Wed, 2 Oct 2024 20:52:59 +0100 Subject: [PATCH 19/50] bug fix with mult cost distribution from pandas constructor --- src/caf/distribute/gravity_model/multi_area.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index cbaa0eb..693eea4 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -126,7 +126,7 @@ def from_pandas( pd.Series(ordered_zones), tld, cat_zone_correspondence, - func_params, + func_params[category], tld_cat_col, tld_min_col, tld_max_col, From ef397bc650ddc666ca69704b13e7c2bfe7f834fb Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Thu, 3 Oct 2024 15:29:33 +0100 Subject: [PATCH 20/50] added option to truncate final bin in plot _distributions --- src/caf/distribute/gravity_model/core.py | 50 +++++++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/src/caf/distribute/gravity_model/core.py b/src/caf/distribute/gravity_model/core.py index 8fba13b..e832b64 100644 --- a/src/caf/distribute/gravity_model/core.py +++ b/src/caf/distribute/gravity_model/core.py @@ -92,27 +92,65 @@ class GravityModelCalibrateResults(GravityModelResults): cost_function: cost_functions.CostFunction cost_params: dict[str, Any] - def plot_distributions(self) -> figure.Figure: + def plot_distributions(self, truncate_last_bin: bool = False) -> figure.Figure: """ Plot a comparison of the achieved and target distributions. This method returns a matplotlib figure which can be saved or plotted as the user decides. """ + + fig, ax = plt.subplots(figsize=(10, 6)) + + if ( + ( + set(self.cost_distribution.max_vals) + != set(self.target_cost_distribution.max_vals) + ) + or ( + set(self.cost_distribution.min_vals) + != set(self.target_cost_distribution.min_vals) + ) + or ( + set(self.cost_distribution.avg_vals) + != set(self.target_cost_distribution.avg_vals) + ) + ): + raise ValueError("To plot distributions, the target and achieved distributions must have the same binning.") + + max_bin_edge = self.cost_distribution.max_vals + + min_bin_edge = self.cost_distribution.min_vals + + bin_centres = self.cost_distribution.avg_vals + + if truncate_last_bin: + max_bin = max_bin_edge.max() + max_bin_edge[max_bin_edge.argmax()] = ( + min_bin_edge[max_bin_edge.argmax()] * 1.2 + ) + + bin_centres[max_bin_edge.argmax()] = ( + max_bin_edge[max_bin_edge.argmax()] + + min_bin_edge[max_bin_edge.argmax()] + ) / 2 + + fig.text(.8, .025, f"final bin edge cut from {max_bin}", ha='center') + + ax.bar( - self.cost_distribution.avg_vals, + bin_centres, self.cost_distribution.band_share_vals, - width=self.cost_distribution.max_vals - self.cost_distribution.min_vals, + width=max_bin_edge - min_bin_edge, label="Achieved Distribution", color="blue", alpha=0.7, ) ax.bar( - self.cost_distribution.avg_vals, + bin_centres, self.target_cost_distribution.band_share_vals, - width=self.target_cost_distribution.max_vals - - self.target_cost_distribution.min_vals, + width=max_bin_edge - min_bin_edge, label="Target Distribution", color="orange", alpha=0.7, From 5da00e90a3d2a002a35efb86a1ad851ab1ad438b Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Thu, 3 Oct 2024 15:31:12 +0100 Subject: [PATCH 21/50] ran black --- src/caf/distribute/gravity_model/core.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/caf/distribute/gravity_model/core.py b/src/caf/distribute/gravity_model/core.py index e832b64..5cef3c0 100644 --- a/src/caf/distribute/gravity_model/core.py +++ b/src/caf/distribute/gravity_model/core.py @@ -100,7 +100,6 @@ def plot_distributions(self, truncate_last_bin: bool = False) -> figure.Figure: as the user decides. """ - fig, ax = plt.subplots(figsize=(10, 6)) if ( @@ -117,7 +116,9 @@ def plot_distributions(self, truncate_last_bin: bool = False) -> figure.Figure: != set(self.target_cost_distribution.avg_vals) ) ): - raise ValueError("To plot distributions, the target and achieved distributions must have the same binning.") + raise ValueError( + "To plot distributions, the target and achieved distributions must have the same binning." + ) max_bin_edge = self.cost_distribution.max_vals @@ -127,17 +128,13 @@ def plot_distributions(self, truncate_last_bin: bool = False) -> figure.Figure: if truncate_last_bin: max_bin = max_bin_edge.max() - max_bin_edge[max_bin_edge.argmax()] = ( - min_bin_edge[max_bin_edge.argmax()] * 1.2 - ) + max_bin_edge[max_bin_edge.argmax()] = min_bin_edge[max_bin_edge.argmax()] * 1.2 bin_centres[max_bin_edge.argmax()] = ( - max_bin_edge[max_bin_edge.argmax()] - + min_bin_edge[max_bin_edge.argmax()] + max_bin_edge[max_bin_edge.argmax()] + min_bin_edge[max_bin_edge.argmax()] ) / 2 - fig.text(.8, .025, f"final bin edge cut from {max_bin}", ha='center') - + fig.text(0.8, 0.025, f"final bin edge cut from {max_bin}", ha="center") ax.bar( bin_centres, From d3d9171abb5a5531c8a9947c2ab91ef018d66aa0 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Fri, 25 Oct 2024 15:10:11 +0100 Subject: [PATCH 22/50] addressed comments in core --- src/caf/distribute/gravity_model/core.py | 75 ++++++++++++------------ 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/src/caf/distribute/gravity_model/core.py b/src/caf/distribute/gravity_model/core.py index 5cef3c0..339e66f 100644 --- a/src/caf/distribute/gravity_model/core.py +++ b/src/caf/distribute/gravity_model/core.py @@ -93,48 +93,48 @@ class GravityModelCalibrateResults(GravityModelResults): cost_params: dict[str, Any] def plot_distributions(self, truncate_last_bin: bool = False) -> figure.Figure: - """ - Plot a comparison of the achieved and target distributions. + """ Plot a comparison of the achieved and target distributions. This method returns a matplotlib figure which can be saved or plotted as the user decides. - """ + Parameters + ---------- + truncate_last_bin : bool, optional + whether to truncate the graph to 1.2x the lower bin edge, by default False + + Returns + ------- + figure.Figure + the plotted distributions + + Raises + ------ + ValueError + when the target and achieved distributions have different binning + """ + fig, ax = plt.subplots(figsize=(10, 6)) - if ( - ( - set(self.cost_distribution.max_vals) - != set(self.target_cost_distribution.max_vals) - ) - or ( - set(self.cost_distribution.min_vals) - != set(self.target_cost_distribution.min_vals) - ) - or ( - set(self.cost_distribution.avg_vals) - != set(self.target_cost_distribution.avg_vals) - ) - ): + errors = [] + for attr in ("max_vals", "min_vals", "avg_vals"): + if set(getattr(self.cost_distribution, attr)) != set( + getattr(self.target_cost_distribution, attr) + ): + errors.append(attr) + + if len(errors) > 0: raise ValueError( - "To plot distributions, the target and achieved distributions must have the same binning." + "To plot distributions, the target and achieved distributions" + " must have the same binning. The distributions have different " + + " and ".join(errors) ) max_bin_edge = self.cost_distribution.max_vals - min_bin_edge = self.cost_distribution.min_vals - bin_centres = self.cost_distribution.avg_vals - if truncate_last_bin: - max_bin = max_bin_edge.max() - max_bin_edge[max_bin_edge.argmax()] = min_bin_edge[max_bin_edge.argmax()] * 1.2 - - bin_centres[max_bin_edge.argmax()] = ( - max_bin_edge[max_bin_edge.argmax()] + min_bin_edge[max_bin_edge.argmax()] - ) / 2 - - fig.text(0.8, 0.025, f"final bin edge cut from {max_bin}", ha="center") + ax.bar( bin_centres, @@ -153,6 +153,11 @@ def plot_distributions(self, truncate_last_bin: bool = False) -> figure.Figure: alpha=0.7, ) + if truncate_last_bin: + top_min_bin = min_bin_edge.max() + ax.set_xlim(0, top_min_bin[-1] * 1.2) + fig.text(0.8, 0.025, f"final bin edge cut from {max_bin_edge.max()}", ha="center") + ax.set_xlabel("Cost") ax.set_ylabel("Trips") ax.set_title("Distribution Comparison") @@ -162,8 +167,7 @@ def plot_distributions(self, truncate_last_bin: bool = False) -> figure.Figure: @property def summary(self) -> pd.Series: - """summary of the GM calibration parameters as a series. - + """Summary of the GM calibration parameters as a series. Outputs the gravity model achieved parameters and the convergence. @@ -172,9 +176,6 @@ def summary(self) -> pd.Series: pd.DataFrame a summary of the calibration """ - output_params = self.cost_params.copy() - output_params["convergence"] = self.cost_convergence - return pd.Series(output_params) @dataclasses.dataclass @@ -215,11 +216,9 @@ class GravityModelRunResults(GravityModelResults): @property def summary(self) -> pd.Series: - """summary of the GM run parameters as a series. + """Summary of the GM run parameters as a series. - - Outputs the gravity model parameters used to generate the distribution. - Parameters + Outputs the gravity model parameters used to generate the distribution. Returns ------- From 9e4e846442e0e2337a24f58a719a2be455486046 Mon Sep 17 00:00:00 2001 From: Kieran Fishwick Date: Fri, 25 Oct 2024 15:13:31 +0100 Subject: [PATCH 23/50] Update .gitignore Co-authored-by: Matt Buckley <165161571+MattBuckley-TfN@users.noreply.github.com> --- .gitignore | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index db26a51..d9fdcb4 100644 --- a/.gitignore +++ b/.gitignore @@ -132,5 +132,5 @@ dmypy.json # Pyre type checker .pyre/ - -settings.json +# VS code settings +.vscode/ From bea78b2c8c58bba4e33c28ba5094a71037722b67 Mon Sep 17 00:00:00 2001 From: Kieran Fishwick Date: Fri, 25 Oct 2024 15:13:57 +0100 Subject: [PATCH 24/50] Update src/caf/distribute/gravity_model/multi_area.py Co-authored-by: Matt Buckley <165161571+MattBuckley-TfN@users.noreply.github.com> --- src/caf/distribute/gravity_model/multi_area.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 367a231..7b8be6c 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -548,8 +548,8 @@ def _jacobian_function( furness_tol: int, diff_step: float, furness_jac: bool, - running_log_path, - params_len, + running_log_path: Path, + params_len: int, ): del running_log_path # Build empty jacobian matrix From 02ed30e28817b2a157ff346cfcf376c0cffa1398 Mon Sep 17 00:00:00 2001 From: Kieran Fishwick Date: Fri, 25 Oct 2024 15:14:29 +0100 Subject: [PATCH 25/50] Update src/caf/distribute/gravity_model/multi_area.py Co-authored-by: Matt Buckley <165161571+MattBuckley-TfN@users.noreply.github.com> --- src/caf/distribute/gravity_model/multi_area.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 7b8be6c..349477c 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -678,7 +678,7 @@ def _gravity_function( # pylint:enable=too-many-locals def run( self, - distributions: list[MultiCostDistribution], + distributions: MultiCostDistribution, running_log_path: Path, furness_tol=DEFAULT_FURNESS_TOL, ) -> dict[int | str, GravityModelCalibrateResults]: From 247531d261de57786fc705f6d6ccb617e9e3eb16 Mon Sep 17 00:00:00 2001 From: Kieran Fishwick Date: Fri, 25 Oct 2024 15:14:54 +0100 Subject: [PATCH 26/50] Update src/caf/distribute/gravity_model/multi_area.py Co-authored-by: Matt Buckley <165161571+MattBuckley-TfN@users.noreply.github.com> --- src/caf/distribute/gravity_model/multi_area.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 349477c..45f94a5 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -680,7 +680,7 @@ def run( self, distributions: MultiCostDistribution, running_log_path: Path, - furness_tol=DEFAULT_FURNESS_TOL, + furness_tol: float = DEFAULT_FURNESS_TOL, ) -> dict[int | str, GravityModelCalibrateResults]: """ Run the gravity_model without calibrating. From 1080523e0d7603ded57817ef595222bd705596d7 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Fri, 25 Oct 2024 17:00:38 +0100 Subject: [PATCH 27/50] working through comments --- .../distribute/gravity_model/multi_area.py | 148 +++++++++++++++--- 1 file changed, 130 insertions(+), 18 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 45f94a5..6149d9a 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -9,6 +9,7 @@ from dataclasses import dataclass from pathlib import Path from typing import Any, Optional, Iterator +import warnings # Third Party import numpy as np @@ -21,7 +22,6 @@ from caf.distribute.gravity_model import core from caf.distribute.gravity_model.core import ( GravityModelCalibrateResults, - GravityModelRunResults, ) # # # CONSTANTS # # # @@ -98,6 +98,13 @@ class MultiDistInput(BaseConfig): @dataclass class MultiCostDistribution: + """Cost distributions to be used for the multi-cost distribution gravity model + + Parameters + ---------- + distributions: list[MGMCostDistribution] + Distributions to be used for the multicost distributions + """ distributions: list[MGMCostDistribution] @classmethod @@ -107,6 +114,7 @@ def from_pandas( tld: pd.DataFrame, cat_zone_correspondence: pd.DataFrame, func_params: dict[int | str, dict[str, float]], + *, tld_cat_col: str = "category", tld_min_col: str = "from", tld_max_col: str = "to", @@ -114,12 +122,57 @@ def from_pandas( tld_trips_col: str = "trips", lookup_cat_col: str = "category", lookup_zone_col: str = "zone_id", - ): + ) -> MultiCostDistribution: + """constructor using pandas dataframes + + Parameters + ---------- + ordered_zones : pd.Series + list of zones in the same order as other inputs + tld : pd.DataFrame + tld data - should contain the tlds for each distribution + labeled by the `tld_cat_col` + cat_zone_correspondence : pd.DataFrame + lookup between categories values within `tld` and zones which + use the corresponding distribution + func_params : dict[int | str, dict[str, float]] + starting/run cost function params to use for each distribution + key: distribution category, value: dict[param name, param value] + tld_cat_col : str, optional + column name for the category column in `tld`, by default "category" + tld_min_col : str, optional + column name for the min bin edge column in `tld`, by default "from" + tld_max_col : str, optional + column name for the max bin edge column in `tld`, by default "to" + tld_avg_col : str, optional + column name for the average distance column in `tld`, by default "av_distance" + tld_trips_col : str, optional + column name for the trips column in `tld`, by default "trips" + lookup_cat_col : str, optional + column name for the category column in `cat_zone_correspondence`, by default "category" + lookup_zone_col : str, optional + column name for the zone column in `cat_zone_correspondence`, by default "zone_id" + + Returns + ------- + MultiCostDistribution + + + Raises + ------ + KeyError + when a category value is not founf in the function parameter keys + + See Also + -------- + `validate` + """ distributions: list[MGMCostDistribution] = [] for category in cat_zone_correspondence[lookup_cat_col].unique(): - + if category not in func_params: + raise KeyError(f"function parameters not provided for {category = }") distributions.append( MGMCostDistribution.from_pandas( category, @@ -127,13 +180,13 @@ def from_pandas( tld, cat_zone_correspondence, func_params[category], - tld_cat_col, - tld_min_col, - tld_max_col, - tld_avg_col, - tld_trips_col, - lookup_cat_col, - lookup_zone_col, + tld_cat_col = tld_cat_col, + tld_min_col = tld_min_col, + tld_max_col = tld_max_col, + tld_avg_col = tld_avg_col, + tld_trips_col = tld_trips_col, + lookup_cat_col = lookup_cat_col, + lookup_zone_col = lookup_zone_col, ) ) @@ -143,6 +196,20 @@ def from_pandas( @classmethod def validate(cls, distributions: list[MGMCostDistribution]): + """ Validates the distributions passed + + Parameters + ---------- + distributions : list[MGMCostDistribution] + Distributions to validate + + Raises + ------ + ValueError + The length of the list of the distributions passed is 0 + ValueError + The same zones are found in multiple distributions + """ if len(distributions) == 0: raise ValueError("no distributions provided") @@ -158,7 +225,7 @@ def validate(cls, distributions: list[MGMCostDistribution]): assert all_zones is not None if len(np.unique(all_zones)) != len(all_zones): - raise ValueError("duplicate zones found in the distribution zone definition") + raise ValueError("duplicate found in the distribution zone definition") def __iter__(self) -> Iterator[MGMCostDistribution]: yield from self.distributions @@ -200,12 +267,14 @@ class MGMCostDistribution: # matrix_id_lookup: np.ndarray # function_params: dict[id, dict[str,float]] - name: str + name: str|int cost_distribution: cost_utils.CostDistribution zones: np.ndarray function_params: dict[str, float] - # TODO validate params + # TODO(kf) validate params + + # TODO(kf) validate cost distributions @classmethod def from_pandas( @@ -215,6 +284,7 @@ def from_pandas( tld: pd.DataFrame, cat_zone_correspondence: pd.DataFrame, func_params: dict[str, float], + *, tld_cat_col: str = "category", tld_min_col: str = "from", tld_max_col: str = "to", @@ -223,6 +293,47 @@ def from_pandas( lookup_cat_col: str = "category", lookup_zone_col: str = "zone_id", ) -> MGMCostDistribution: + """constructor that uses pandas dataframes and series + + Parameters + ---------- + category : str | int + distribution category, used to label gravity model run + ordered_zones : pd.Series + zones ordered in the same way as other inputs + tld : pd.DataFrame + tld data - should contain the tlds for each distribution + labeled by the `tld_cat_col` + cat_zone_correspondence : pd.DataFrame + lookup between categories values within `tld` and zones which + use the corresponding distribution + func_params : dict[int | str, dict[str, float]] + starting/run cost function params to use for each distribution + key: distribution category, value: dict[param name, param value] + tld_cat_col : str, optional + column name for the category column in `tld`, by default "category" + tld_min_col : str, optional + column name for the min bin edge column in `tld`, by default "from" + tld_max_col : str, optional + column name for the max bin edge column in `tld`, by default "to" + tld_avg_col : str, optional + column name for the average distance column in `tld`, by default "av_distance" + tld_trips_col : str, optional + column name for the trips column in `tld`, by default "trips" + lookup_cat_col : str, optional + column name for the category column in `cat_zone_correspondence`, by default "category" + lookup_zone_col : str, optional + column name for the zone column in `cat_zone_correspondence`, by default "zone_id" + + Returns + ------- + MGMCostDistribution + + Raises + ------ + ValueError + if zones in `cat_zone_correspondence` are not present in `ordered_zones` + """ # get a list of zones that use this category of TLD cat_zones = cat_zone_correspondence.loc[ cat_zone_correspondence[lookup_cat_col] == category, lookup_zone_col @@ -288,8 +399,9 @@ def __init__( super().__init__(cost_function=cost_function, cost_matrix=cost_matrix) if row_targets.sum() != col_targets.sum(): - LOG.info( + warnings.warn( "row and column target totals do not match. This is likely to cause Furnessing to fail." + f" Difference (row targets - col targets) = {round(row_targets.sum() - col_targets.sum(),2)}" ) checks = { @@ -306,11 +418,11 @@ def __init__( num_zeros = (data == 0).sum() # casting bool as 1, 0 - LOG.info(f"There are {num_zeros} 0s in {name} ({(num_zeros/data.size)*100} %)") + LOG.info("There are %s 0s in %s (%s %)", num_zeros, name, (num_zeros/data.size)*100) zero_in_both = np.stack([row_targets == 0, col_targets == 0], axis=1).all(axis=1).sum() - LOG.info(f"There are {zero_in_both} zones with both 0 row and column targets.") + LOG.info("There are %s zones with both 0 row and column targets.", zero_in_both) self.row_targets = row_targets self.col_targets = col_targets @@ -387,7 +499,7 @@ def _calibrate( ) -> dict[str | int, GravityModelCalibrateResults]: params_len = len(distributions[0].function_params) ordered_init_params = [] - # TODO validate cost distributions + for dist in distributions: params = self._order_cost_params(dist.function_params) for val in params: @@ -409,7 +521,7 @@ def _calibrate( " with a cost greater than the binning" ) if min_cost < min_binning: - LOG.warning( + warnings.warn( f"the min cost in the cost matrix for" f" category {dist.name}, was {min_cost}, " "whereas the lowest bin edge in cost" From 54984f4fedaa9cc1e6c105454429ebeb6a6370a4 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 09:22:18 +0000 Subject: [PATCH 28/50] addressed most comments, updated tests to the new API --- src/caf/distribute/gravity_model/core.py | 14 ++- .../distribute/gravity_model/multi_area.py | 90 +++++++++++-------- .../tanner/calibrate/band_share.csv | 20 ++--- .../tanner/calibrate/best_params.json | 2 +- .../tanner/calibrate/convergence.csv | 2 +- .../tanner/calibrate/distribution.csv | 10 +-- .../tanner/calibrate/init_params.json | 2 +- .../tanner/calibrate/residuals.csv | 20 ++--- tests/gravity_model/test_multi_area.py | 38 ++++++-- tests/gravity_model/test_single_area.py | 3 +- 10 files changed, 120 insertions(+), 81 deletions(-) diff --git a/src/caf/distribute/gravity_model/core.py b/src/caf/distribute/gravity_model/core.py index 339e66f..74161ff 100644 --- a/src/caf/distribute/gravity_model/core.py +++ b/src/caf/distribute/gravity_model/core.py @@ -93,7 +93,7 @@ class GravityModelCalibrateResults(GravityModelResults): cost_params: dict[str, Any] def plot_distributions(self, truncate_last_bin: bool = False) -> figure.Figure: - """ Plot a comparison of the achieved and target distributions. + """Plot a comparison of the achieved and target distributions. This method returns a matplotlib figure which can be saved or plotted as the user decides. @@ -112,8 +112,8 @@ def plot_distributions(self, truncate_last_bin: bool = False) -> figure.Figure: ------ ValueError when the target and achieved distributions have different binning - """ - + """ + fig, ax = plt.subplots(figsize=(10, 6)) errors = [] @@ -134,8 +134,6 @@ def plot_distributions(self, truncate_last_bin: bool = False) -> figure.Figure: min_bin_edge = self.cost_distribution.min_vals bin_centres = self.cost_distribution.avg_vals - - ax.bar( bin_centres, self.cost_distribution.band_share_vals, @@ -155,7 +153,7 @@ def plot_distributions(self, truncate_last_bin: bool = False) -> figure.Figure: if truncate_last_bin: top_min_bin = min_bin_edge.max() - ax.set_xlim(0, top_min_bin[-1] * 1.2) + ax.set_xlim(0, top_min_bin[-1] * 1.2) fig.text(0.8, 0.025, f"final bin edge cut from {max_bin_edge.max()}", ha="center") ax.set_xlabel("Cost") @@ -216,9 +214,9 @@ class GravityModelRunResults(GravityModelResults): @property def summary(self) -> pd.Series: - """Summary of the GM run parameters as a series. + """Summary of the GM run parameters as a series. - Outputs the gravity model parameters used to generate the distribution. + Outputs the gravity model parameters used to generate the distribution. Returns ------- diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 6149d9a..c2836c3 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -1,15 +1,15 @@ # -*- coding: utf-8 -*- """Implementation of a self-calibrating single area gravity model.""" -# Built-Ins from __future__ import annotations +# Built-Ins import functools import logging import os +import warnings from dataclasses import dataclass from pathlib import Path -from typing import Any, Optional, Iterator -import warnings +from typing import Any, Iterator, Optional # Third Party import numpy as np @@ -20,9 +20,7 @@ # Local Imports from caf.distribute import cost_functions, furness from caf.distribute.gravity_model import core -from caf.distribute.gravity_model.core import ( - GravityModelCalibrateResults, -) +from caf.distribute.gravity_model.core import GravityModelCalibrateResults # # # CONSTANTS # # # LOG = logging.getLogger(__name__) @@ -104,7 +102,8 @@ class MultiCostDistribution: ---------- distributions: list[MGMCostDistribution] Distributions to be used for the multicost distributions - """ + """ + distributions: list[MGMCostDistribution] @classmethod @@ -136,8 +135,8 @@ def from_pandas( lookup between categories values within `tld` and zones which use the corresponding distribution func_params : dict[int | str, dict[str, float]] - starting/run cost function params to use for each distribution - key: distribution category, value: dict[param name, param value] + starting/run cost function params to use for each distribution + key: distribution category, value: dict[param name, param value] tld_cat_col : str, optional column name for the category column in `tld`, by default "category" tld_min_col : str, optional @@ -157,21 +156,21 @@ def from_pandas( ------- MultiCostDistribution - + Raises ------ KeyError - when a category value is not founf in the function parameter keys + when a category value is not founf in the function parameter keys See Also -------- `validate` - """ + """ distributions: list[MGMCostDistribution] = [] for category in cat_zone_correspondence[lookup_cat_col].unique(): - if category not in func_params: + if category not in func_params: raise KeyError(f"function parameters not provided for {category = }") distributions.append( MGMCostDistribution.from_pandas( @@ -180,13 +179,13 @@ def from_pandas( tld, cat_zone_correspondence, func_params[category], - tld_cat_col = tld_cat_col, - tld_min_col = tld_min_col, - tld_max_col = tld_max_col, - tld_avg_col = tld_avg_col, - tld_trips_col = tld_trips_col, - lookup_cat_col = lookup_cat_col, - lookup_zone_col = lookup_zone_col, + tld_cat_col=tld_cat_col, + tld_min_col=tld_min_col, + tld_max_col=tld_max_col, + tld_avg_col=tld_avg_col, + tld_trips_col=tld_trips_col, + lookup_cat_col=lookup_cat_col, + lookup_zone_col=lookup_zone_col, ) ) @@ -196,7 +195,7 @@ def from_pandas( @classmethod def validate(cls, distributions: list[MGMCostDistribution]): - """ Validates the distributions passed + """Validates the distributions passed Parameters ---------- @@ -209,7 +208,7 @@ def validate(cls, distributions: list[MGMCostDistribution]): The length of the list of the distributions passed is 0 ValueError The same zones are found in multiple distributions - """ + """ if len(distributions) == 0: raise ValueError("no distributions provided") @@ -267,7 +266,7 @@ class MGMCostDistribution: # matrix_id_lookup: np.ndarray # function_params: dict[id, dict[str,float]] - name: str|int + name: str | int cost_distribution: cost_utils.CostDistribution zones: np.ndarray function_params: dict[str, float] @@ -298,7 +297,7 @@ def from_pandas( Parameters ---------- category : str | int - distribution category, used to label gravity model run + distribution category, used to label gravity model run ordered_zones : pd.Series zones ordered in the same way as other inputs tld : pd.DataFrame @@ -308,8 +307,8 @@ def from_pandas( lookup between categories values within `tld` and zones which use the corresponding distribution func_params : dict[int | str, dict[str, float]] - starting/run cost function params to use for each distribution - key: distribution category, value: dict[param name, param value] + starting/run cost function params to use for each distribution + key: distribution category, value: dict[param name, param value] tld_cat_col : str, optional column name for the category column in `tld`, by default "category" tld_min_col : str, optional @@ -332,8 +331,8 @@ def from_pandas( Raises ------ ValueError - if zones in `cat_zone_correspondence` are not present in `ordered_zones` - """ + if zones in `cat_zone_correspondence` are not present in `ordered_zones` + """ # get a list of zones that use this category of TLD cat_zones = cat_zone_correspondence.loc[ cat_zone_correspondence[lookup_cat_col] == category, lookup_zone_col @@ -418,7 +417,9 @@ def __init__( num_zeros = (data == 0).sum() # casting bool as 1, 0 - LOG.info("There are %s 0s in %s (%s %)", num_zeros, name, (num_zeros/data.size)*100) + LOG.info( + "There are %s 0s in %s (%s %)", num_zeros, name, (num_zeros / data.size) * 100 + ) zero_in_both = np.stack([row_targets == 0, col_targets == 0], axis=1).all(axis=1).sum() @@ -499,7 +500,7 @@ def _calibrate( ) -> dict[str | int, GravityModelCalibrateResults]: params_len = len(distributions[0].function_params) ordered_init_params = [] - + for dist in distributions: params = self._order_cost_params(dist.function_params) for val in params: @@ -627,8 +628,10 @@ def calibrate( Parameters ---------- - distributions: MultiCostDistribution, + distributions: MultiCostDistribution + distributions to use for the calibrations running_log_path: os.PathLike, + path to a csv to log the model iterations and results *args, **kwargs, Returns @@ -656,7 +659,7 @@ def calibrate( def _jacobian_function( self, init_params: list[float], - cost_distributions: list[MultiCostDistribution], + cost_distributions: MultiCostDistribution, furness_tol: int, diff_step: float, furness_jac: bool, @@ -723,12 +726,12 @@ def _jacobian_function( def _gravity_function( self, - init_params, - cost_distributions, - furness_tol, - running_log_path, - params_len, - diff_step=0, + init_params: dict[str, float], + cost_distributions: MGMCostDistribution, + furness_tol: float, + running_log_path: os.PathLike, + params_len: int, + diff_step: int = 0, **_, ): del diff_step @@ -799,6 +802,17 @@ def run( This should be done when you have calibrating previously to find the correct parameters for the cost function. + + Parameters + ---------- + distributions : MultiCostDistribution + Distributions to use to run the gravity model + running_log_path : Path + Csv path to log results and info + furness_tol : float, optional + tolerance for difference in target and achieved value, + at which to stop furnessing, by default DEFAULT_FURNESS_TOL + """ params_len = len(distributions[0].function_params) cost_args = [] diff --git a/tests/gravity_model/data/small_and_simple/tanner/calibrate/band_share.csv b/tests/gravity_model/data/small_and_simple/tanner/calibrate/band_share.csv index be969f2..4659e03 100644 --- a/tests/gravity_model/data/small_and_simple/tanner/calibrate/band_share.csv +++ b/tests/gravity_model/data/small_and_simple/tanner/calibrate/band_share.csv @@ -1,10 +1,10 @@ -1.87107466097356000E-01 -2.61226937457715000E-01 -3.14059946560420000E-01 -1.28081255847000000E-01 -5.26257244646151000E-02 -3.51900912278132000E-02 -2.17085783450793000E-02 -0.00000000000000000E+00 -0.00000000000000000E+00 -0.00000000000000000E+00 +0.18659723285411958 +0.2604281147910556 +0.31313501088371376 +0.12916376836331442 +0.05252735707438472 +0.03584492791638932 +0.022303588117022634 +0.0 +0.0 +0.0 diff --git a/tests/gravity_model/data/small_and_simple/tanner/calibrate/best_params.json b/tests/gravity_model/data/small_and_simple/tanner/calibrate/best_params.json index 5283ab9..5da0e19 100644 --- a/tests/gravity_model/data/small_and_simple/tanner/calibrate/best_params.json +++ b/tests/gravity_model/data/small_and_simple/tanner/calibrate/best_params.json @@ -1 +1 @@ -{"alpha": 0.23493815493647896, "beta": -0.04890898898106072} \ No newline at end of file +{"alpha": 0.22740538185992853, "beta": -0.048000303441957576} \ No newline at end of file diff --git a/tests/gravity_model/data/small_and_simple/tanner/calibrate/convergence.csv b/tests/gravity_model/data/small_and_simple/tanner/calibrate/convergence.csv index 5c1f0c5..776f66e 100644 --- a/tests/gravity_model/data/small_and_simple/tanner/calibrate/convergence.csv +++ b/tests/gravity_model/data/small_and_simple/tanner/calibrate/convergence.csv @@ -1 +1 @@ -0.964137289 +0.9641013548903947 diff --git a/tests/gravity_model/data/small_and_simple/tanner/calibrate/distribution.csv b/tests/gravity_model/data/small_and_simple/tanner/calibrate/distribution.csv index 5dbe17b..a208467 100644 --- a/tests/gravity_model/data/small_and_simple/tanner/calibrate/distribution.csv +++ b/tests/gravity_model/data/small_and_simple/tanner/calibrate/distribution.csv @@ -1,5 +1,5 @@ -97.4101346338488,78.4948263628220,0.0000000088069,31.0716270331579,11.0234119613643 -85.1437051185380,88.2004717234723,0.0000000086316,36.0785166330549,18.5773065163029 -0.0000000095534,0.0000000086321,95.4985904609788,56.5014095077785,0.0000000130571 -28.9337963389644,30.9726657873415,48.5025556411786,58.3963728235764,73.1946094089389 -10.5114977556484,16.3312639795707,0.0000000114778,74.9525195875706,52.2047186657323 +97.1560202838155,78.17871663799052,1.329688557490335e-08,31.348195863527256,11.317067201369866 +84.80916522349123,87.9484346905578,1.3044508589762575e-08,36.332113875439624,18.9102861974668 +1.4435151012466811e-08,1.3054037970350208e-08,95.35680798764004,56.643191965287954,1.9582804455929795e-08 +29.225908225507546,31.22424024022602,48.644280412339235,58.24002830502321,72.66554281690401 +10.808092208694884,16.647882292342402,1.7227356206447182e-08,74.43688726870683,52.10713821302855 diff --git a/tests/gravity_model/data/small_and_simple/tanner/calibrate/init_params.json b/tests/gravity_model/data/small_and_simple/tanner/calibrate/init_params.json index 545b595..4c5523e 100644 --- a/tests/gravity_model/data/small_and_simple/tanner/calibrate/init_params.json +++ b/tests/gravity_model/data/small_and_simple/tanner/calibrate/init_params.json @@ -1 +1 @@ -{"alpha": 4.71919453869341, "beta": -1.66937805357705} \ No newline at end of file +{"alpha": 0.71919453869341, "beta": -0.66937805357705} \ No newline at end of file diff --git a/tests/gravity_model/data/small_and_simple/tanner/calibrate/residuals.csv b/tests/gravity_model/data/small_and_simple/tanner/calibrate/residuals.csv index 2633d4a..68e496f 100644 --- a/tests/gravity_model/data/small_and_simple/tanner/calibrate/residuals.csv +++ b/tests/gravity_model/data/small_and_simple/tanner/calibrate/residuals.csv @@ -1,10 +1,10 @@ -0.0124893339026437000000 -0.0049020625422841400000 --0.0368422465604202000000 --0.0040893558470000400000 -0.0471726755353848000000 --0.0140206912278132000000 --0.0096117783450793300000 -0.0000000000000000000000 -0.0000000000000000000000 -0.0000000000000000000000 +0.012999567145880409 +0.005700885208944351 +-0.035917310883713804 +-0.00517186836331443 +0.04727104292561528 +-0.01467552791638932 +-0.010206788117022634 +0.0 +0.0 +0.0 diff --git a/tests/gravity_model/test_multi_area.py b/tests/gravity_model/test_multi_area.py index 74686ca..c649ebb 100644 --- a/tests/gravity_model/test_multi_area.py +++ b/tests/gravity_model/test_multi_area.py @@ -147,8 +147,34 @@ def fixture_jac_furn(data_dir, mock_dir): return conf +@pytest.fixture(name="multi_tld", scope="session") +def _multi_tld(data_dir, mock_dir): + tld_lookup = pd.read_csv(data_dir / "distributions_lookup.csv") + + ordered_zones = tld_lookup["zone"] + + func_parameters = {} + for cat in tld_lookup["cat"].unique(): + func_parameters[cat] = {"mu": 1, "sigma": 2} + + multitld = gm.MultiCostDistribution.from_pandas( + ordered_zones, + pd.read_csv(data_dir / "distributions.csv"), + tld_lookup, + func_parameters, + tld_cat_col="cat", + tld_min_col="lower", + tld_max_col="upper", + tld_avg_col="avg", + tld_trips_col="trips", + lookup_cat_col="cat", + lookup_zone_col="zone", + ) + return multitld + + @pytest.fixture(name="cal_no_furness", scope="session") -def fixture_cal_no_furness(data_dir, infilled, no_furness_jac_conf, trip_ends, mock_dir): +def fixture_cal_no_furness(data_dir, infilled, multi_tld, trip_ends, mock_dir): row_targets = trip_ends["origin"].values col_targets = trip_ends["destination"].values model = gm.MultiAreaGravityModelCalibrator( @@ -156,9 +182,10 @@ def fixture_cal_no_furness(data_dir, infilled, no_furness_jac_conf, trip_ends, m col_targets=col_targets, cost_matrix=infilled, cost_function=cost_functions.BuiltInCostFunction.LOG_NORMAL.get_cost_function(), - params=no_furness_jac_conf, ) - results = model.calibrate(running_log_path=mock_dir / "temp_log.csv") + results = model.calibrate( + multi_tld, running_log_path=mock_dir / "temp_log.csv", furness_jac=False + ) return results @@ -171,7 +198,7 @@ def test_infill_costs(self, infilled_from_code, infilled_expected): class TestDist: @pytest.fixture(name="cal_furness", scope="session") - def fixture_cal_furness(self, data_dir, infilled, furness_jac_conf, trip_ends, mock_dir): + def fixture_cal_furness(self, data_dir, infilled, multi_tld, trip_ends, mock_dir): row_targets = trip_ends["origin"].values col_targets = trip_ends["destination"].values model = gm.MultiAreaGravityModelCalibrator( @@ -179,9 +206,8 @@ def fixture_cal_furness(self, data_dir, infilled, furness_jac_conf, trip_ends, m col_targets=col_targets, cost_matrix=infilled, cost_function=cost_functions.BuiltInCostFunction.LOG_NORMAL.get_cost_function(), - params=furness_jac_conf, ) - results = model.calibrate(running_log_path=mock_dir / "temp_log.csv") + results = model.calibrate(multi_tld, running_log_path=mock_dir / "temp_log.csv") return results @pytest.mark.parametrize("area", ["City", "Town", "External", "Village"]) diff --git a/tests/gravity_model/test_single_area.py b/tests/gravity_model/test_single_area.py index bbdb51f..ec0595b 100644 --- a/tests/gravity_model/test_single_area.py +++ b/tests/gravity_model/test_single_area.py @@ -20,7 +20,6 @@ from caf.distribute import cost_functions from caf.distribute.gravity_model import ( GravityModelCalibrateResults, - GravityModelResults, GravityModelRunResults, SingleAreaGravityModelCalibrator, ) @@ -438,6 +437,7 @@ def fixture_simple_tanner_calib(tmp_path) -> GMCalibrateResults: """Load in the small_and_simple log normal test""" path = tmp_path / "simple_tanner_calib" path.mkdir() + # TODO(KF) sense check these data sets make sure all is good return simple_gm_calib_results( tmp_path=path, cost_function=cost_functions.BuiltInCostFunction.TANNER.get_cost_function(), @@ -563,6 +563,7 @@ def test_normal_calibrate(self, fixture_str, request): init_params=run_and_results.calib_init_params, n_random_tries=0, ) + run_and_results.assert_results( gm_results=gm_results, ) From d943bf0f9892e111a25881f8cc29cb8a9d82e3fc Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 10:15:48 +0000 Subject: [PATCH 29/50] created GMCalibParams as an input to calibrate and reformatted --- .../distribute/gravity_model/multi_area.py | 120 +++++++++--------- 1 file changed, 57 insertions(+), 63 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index c2836c3..3ae2ac8 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -93,6 +93,17 @@ class MultiDistInput(BaseConfig): furness_tolerance: float = 1e-6 furness_jac: float = False +@dataclass +class GMCalibParams: + furness_jac: bool = False, + diff_step: float = 1e-8, + ftol: float = 1e-4, + xtol: float = 1e-4, + furness_tol=DEFAULT_FURNESS_TOL, + grav_max_iters: int = 100, + failure_tol: float = 0, + default_retry: bool = True, + @dataclass class MultiCostDistribution: @@ -483,25 +494,54 @@ def _create_seed_matrix(self, cost_distributions, cost_args, params_len): return base_mat # pylint: disable=too-many-locals - def _calibrate( + def calibrate( self, distributions: MultiCostDistribution, running_log_path: Path, - furness_jac: bool = False, - diff_step: float = 1e-8, - ftol: float = 1e-4, - xtol: float = 1e-4, - furness_tol=DEFAULT_FURNESS_TOL, - grav_max_iters: int = 100, - failure_tol: float = 0, - default_retry: bool = True, + gm_params: GMCalibParams, verbose: int = 0, **kwargs, ) -> dict[str | int, GravityModelCalibrateResults]: + """Find the optimal parameters for self.cost_function. + + Optimal parameters are found using `scipy.optimize.least_squares` + to fit the distributed row/col targets to `target_cost_distribution`. + + NOTE: The achieved distribution is found by accessing self.achieved + distribution of the object this method is called on. The output of + this method shows the distribution and results for each individual TLD. + + Parameters + ---------- + distributions: MultiCostDistribution + distributions to use for the calibrations + running_log_path: os.PathLike, + path to a csv to log the model iterations and results + gm_params: GMCalibParams + defines the detailed parameters, see `GMCalibParams` documentation for more info + *args, + **kwargs, + Returns + ------- + dict[str | int, GravityModelCalibrateResults]: + containings the achieved distributions for each tld category. To access + the combined distribution use self.achieved_distribution + + See Also + -------- + `caf.distribute.furness.doubly_constrained_furness()` + `scipy.optimize.least_squares()` + `caf.distribute.gravity_model.multi_area.GMCalibParams` + """ + + self._validate_running_log(running_log_path) + self._initialise_internal_params() + params_len = len(distributions[0].function_params) ordered_init_params = [] for dist in distributions: + self.cost_function.validate_params(dist.function_params) params = self._order_cost_params(dist.function_params) for val in params: ordered_init_params.append(val) @@ -534,10 +574,10 @@ def _calibrate( gravity_kwargs: dict[str, Any] = { "running_log_path": running_log_path, "cost_distributions": distributions, - "diff_step": diff_step, + "diff_step": gm_params.diff_step, "params_len": params_len, - "furness_jac": furness_jac, - "furness_tol": furness_tol, + "furness_jac": gm_params.furness_jac, + "furness_tol": gm_params.furness_tol, } optimise_cost_params = functools.partial( optimize.least_squares, @@ -549,9 +589,9 @@ def _calibrate( ), jac=self._jacobian_function, verbose=verbose, - ftol=ftol, - xtol=xtol, - max_nfev=grav_max_iters, + ftol=gm_params.ftol, + xtol=gm_params.xtol, + max_nfev=gm_params.grav_max_iters, kwargs=gravity_kwargs | kwargs, ) result = optimise_cost_params(x0=ordered_init_params) @@ -568,14 +608,14 @@ def _calibrate( best_convergence = self.achieved_convergence best_params = result.x - if (not all(self.achieved_convergence) >= failure_tol) and default_retry: + if (not all(self.achieved_convergence) >= gm_params.failure_tol) and gm_params.default_retry: LOG.info( "%sachieved a convergence of %s, " "however the failure tolerance is set to %s. Trying again with " "default cost parameters.", self.unique_id, self.achieved_convergence, - failure_tol, + gm_params.failure_tol, ) self._attempt_id += 1 ordered_init_params = self._order_cost_params(self.cost_function.default_params) @@ -610,52 +650,6 @@ def _calibrate( results[dist.name] = result_i return results - def calibrate( - self, - distributions: MultiCostDistribution, - running_log_path: os.PathLike, - *args, - **kwargs, - ) -> dict[str | int, GravityModelCalibrateResults]: - """Find the optimal parameters for self.cost_function. - - Optimal parameters are found using `scipy.optimize.least_squares` - to fit the distributed row/col targets to `target_cost_distribution`. - - NOTE: The achieved distribution is found by accessing self.achieved - distribution of the object this method is called on. The output of - this method shows the distribution and results for each individual TLD. - - Parameters - ---------- - distributions: MultiCostDistribution - distributions to use for the calibrations - running_log_path: os.PathLike, - path to a csv to log the model iterations and results - *args, - **kwargs, - Returns - ------- - dict[str | int, GravityModelCalibrateResults]: - An instance of GravityModelCalibrateResults containing the - results of this run. - - See Also - -------- - `caf.distribute.furness.doubly_constrained_furness()` - `scipy.optimize.least_squares()` - """ - for dist in distributions: - self.cost_function.validate_params(dist.function_params) - self._validate_running_log(running_log_path) - self._initialise_internal_params() - return self._calibrate( # type: ignore - distributions, - running_log_path, - *args, - **kwargs, - ) - def _jacobian_function( self, init_params: list[float], From bcc97c534bb8c0d3f98312ea3004d6790acef4c6 Mon Sep 17 00:00:00 2001 From: KieranFishwick Date: Tue, 29 Oct 2024 10:58:41 +0000 Subject: [PATCH 30/50] bug fixes --- .../distribute/gravity_model/multi_area.py | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 3ae2ac8..351881b 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -93,16 +93,17 @@ class MultiDistInput(BaseConfig): furness_tolerance: float = 1e-6 furness_jac: float = False + @dataclass class GMCalibParams: - furness_jac: bool = False, - diff_step: float = 1e-8, - ftol: float = 1e-4, - xtol: float = 1e-4, - furness_tol=DEFAULT_FURNESS_TOL, - grav_max_iters: int = 100, - failure_tol: float = 0, - default_retry: bool = True, + furness_jac: bool = False + diff_step: float = 1e-8 + ftol: float = 1e-4 + xtol: float = 1e-4 + furness_tol: float = DEFAULT_FURNESS_TOL + grav_max_iters: int = 100 + failure_tol: float = 0 + default_retry: bool = True @dataclass @@ -429,7 +430,7 @@ def __init__( num_zeros = (data == 0).sum() # casting bool as 1, 0 LOG.info( - "There are %s 0s in %s (%s %)", num_zeros, name, (num_zeros / data.size) * 100 + "There are %s 0s in %s (%s percent)", num_zeros, name, (num_zeros / data.size) * 100 ) zero_in_both = np.stack([row_targets == 0, col_targets == 0], axis=1).all(axis=1).sum() @@ -498,7 +499,7 @@ def calibrate( self, distributions: MultiCostDistribution, running_log_path: Path, - gm_params: GMCalibParams, + gm_params: GMCalibParams, verbose: int = 0, **kwargs, ) -> dict[str | int, GravityModelCalibrateResults]: @@ -518,7 +519,7 @@ def calibrate( running_log_path: os.PathLike, path to a csv to log the model iterations and results gm_params: GMCalibParams - defines the detailed parameters, see `GMCalibParams` documentation for more info + defines the detailed parameters, see `GMCalibParams` documentation for more info *args, **kwargs, Returns @@ -533,7 +534,7 @@ def calibrate( `scipy.optimize.least_squares()` `caf.distribute.gravity_model.multi_area.GMCalibParams` """ - + self._validate_running_log(running_log_path) self._initialise_internal_params() @@ -608,7 +609,9 @@ def calibrate( best_convergence = self.achieved_convergence best_params = result.x - if (not all(self.achieved_convergence) >= gm_params.failure_tol) and gm_params.default_retry: + if ( + not all(self.achieved_convergence) >= gm_params.failure_tol + ) and gm_params.default_retry: LOG.info( "%sachieved a convergence of %s, " "however the failure tolerance is set to %s. Trying again with " From 4f2e67234938b0fb55e2c6eb15db6cac77f19eac Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 11:25:39 +0000 Subject: [PATCH 31/50] docs for new calib params class and a lil tidy up --- .../distribute/gravity_model/multi_area.py | 70 +++++++++++++++++-- 1 file changed, 65 insertions(+), 5 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 351881b..4d2c062 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -24,7 +24,6 @@ # # # CONSTANTS # # # LOG = logging.getLogger(__name__) -DEFAULT_FURNESS_TOL = 1e-6 # pylint:disable=duplicate-code @@ -96,11 +95,69 @@ class MultiDistInput(BaseConfig): @dataclass class GMCalibParams: + """Parameters required for the multi tld gravity mode calibrate method + + All of the arguements have defaults, i.e. you can create the default object with + no arguements. HOWEVER, read the parameter section below, it is important to + understand the impact and implications of the parameters you use. If they don't make + sense, go pester your nearest Demand Modelling expert. + + Parameters + ---------- + furness_jac: bool, optional + Whether to Furness within the Jacobian function. Not furnessing within + the Jacobian does not represent knock on effects to other areas of + altering parameters for a given area. If you expect these effects to be + significant this should be set to True, but otherwise the process runs + quicker with it set to False. Default False. + + diff_step: float, optional + Copied from scipy.optimize.least_squares documentation, where it + is passed to: + Determines the relative step size for the finite difference + approximation of the Jacobian. The actual step is computed as + `x * diff_step`. If None (default), then diff_step is taken to be a + conventional “optimal” power of machine epsilon for the finite + difference scheme used, default 1e-8 + + ftol: float, optional + The tolerance to pass to `scipy.optimize.least_squares`. The search + will stop once this tolerance has been met. This is the + tolerance for termination by the change of the cost function, default 1e-4 + + xtol: float, optional + The tolerance to pass to `scipy.optimize.least_squares`. The search + will stop once this tolerance has been met. This is the + tolerance for termination by the change of the independent + variables. Default 1e-4 + + furness_tol: float, optional + Target Root Mean Square Error that is aimed for with each furness iteration, + once condition is met furness with terminate, returning that iterations results. + Default 1e-6 + + grav_max_iters: int, optional + The maximum number of calibration iterations to complete before + termination if the ftol has not been met. Default 100 + + failure_tol: float, optional + If, after initial calibration using `init_params`, the achieved + convergence is less than this value, calibration will be run again with + the default parameters from `self.cost_function`. Default 0 + + default_retry: bool, optional: + If, after running with `init_params`, the achieved convergence + is less than `failure_tol`, calibration will be run again with the + default parameters of `self.cost_function`. + This argument is ignored if the default parameters are given + as `init_params. Default True + """ + furness_jac: bool = False diff_step: float = 1e-8 ftol: float = 1e-4 xtol: float = 1e-4 - furness_tol: float = DEFAULT_FURNESS_TOL + furness_tol: float = 1e-6 grav_max_iters: int = 100 failure_tol: float = 0 default_retry: bool = True @@ -430,7 +487,10 @@ def __init__( num_zeros = (data == 0).sum() # casting bool as 1, 0 LOG.info( - "There are %s 0s in %s (%s percent)", num_zeros, name, (num_zeros / data.size) * 100 + "There are %s 0s in %s (%s percent)", + num_zeros, + name, + (num_zeros / data.size) * 100, ) zero_in_both = np.stack([row_targets == 0, col_targets == 0], axis=1).all(axis=1).sum() @@ -792,7 +852,7 @@ def run( self, distributions: MultiCostDistribution, running_log_path: Path, - furness_tol: float = DEFAULT_FURNESS_TOL, + furness_tol: float = 1e-6, ) -> dict[int | str, GravityModelCalibrateResults]: """ Run the gravity_model without calibrating. @@ -808,7 +868,7 @@ def run( Csv path to log results and info furness_tol : float, optional tolerance for difference in target and achieved value, - at which to stop furnessing, by default DEFAULT_FURNESS_TOL + at which to stop furnessing, by default 1e-6 """ params_len = len(distributions[0].function_params) From 7648781ec237e45995f4860811620d38f4c6a968 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 11:33:29 +0000 Subject: [PATCH 32/50] updated tests to new API --- src/caf/distribute/gravity_model/__init__.py | 2 +- tests/gravity_model/test_multi_area.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/caf/distribute/gravity_model/__init__.py b/src/caf/distribute/gravity_model/__init__.py index 8a2f4bd..fb7bf1c 100644 --- a/src/caf/distribute/gravity_model/__init__.py +++ b/src/caf/distribute/gravity_model/__init__.py @@ -10,4 +10,4 @@ # Models from caf.distribute.gravity_model.single_area import SingleAreaGravityModelCalibrator -from caf.distribute.gravity_model.multi_area import MultiAreaGravityModelCalibrator +from caf.distribute.gravity_model.multi_area import MultiAreaGravityModelCalibrator, GMCalibParams diff --git a/tests/gravity_model/test_multi_area.py b/tests/gravity_model/test_multi_area.py index c649ebb..a0a24f7 100644 --- a/tests/gravity_model/test_multi_area.py +++ b/tests/gravity_model/test_multi_area.py @@ -184,7 +184,7 @@ def fixture_cal_no_furness(data_dir, infilled, multi_tld, trip_ends, mock_dir): cost_function=cost_functions.BuiltInCostFunction.LOG_NORMAL.get_cost_function(), ) results = model.calibrate( - multi_tld, running_log_path=mock_dir / "temp_log.csv", furness_jac=False + multi_tld, running_log_path=mock_dir / "temp_log.csv", gm_params=gm.GMCalibParams() ) return results @@ -207,7 +207,7 @@ def fixture_cal_furness(self, data_dir, infilled, multi_tld, trip_ends, mock_dir cost_matrix=infilled, cost_function=cost_functions.BuiltInCostFunction.LOG_NORMAL.get_cost_function(), ) - results = model.calibrate(multi_tld, running_log_path=mock_dir / "temp_log.csv") + results = model.calibrate(multi_tld, running_log_path=mock_dir / "temp_log.csv", gm_params=gm.GMCalibParams(furness_jac=True)) return results @pytest.mark.parametrize("area", ["City", "Town", "External", "Village"]) From cd9c3e91b4e9e58bbcf1a61f65fcade5aba7972e Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 11:44:24 +0000 Subject: [PATCH 33/50] ran black. some linting fixes --- src/caf/distribute/gravity_model/__init__.py | 5 ++++- src/caf/distribute/gravity_model/core.py | 2 +- src/caf/distribute/gravity_model/multi_area.py | 4 ++-- tests/gravity_model/test_multi_area.py | 6 +++++- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/caf/distribute/gravity_model/__init__.py b/src/caf/distribute/gravity_model/__init__.py index fb7bf1c..cc72060 100644 --- a/src/caf/distribute/gravity_model/__init__.py +++ b/src/caf/distribute/gravity_model/__init__.py @@ -10,4 +10,7 @@ # Models from caf.distribute.gravity_model.single_area import SingleAreaGravityModelCalibrator -from caf.distribute.gravity_model.multi_area import MultiAreaGravityModelCalibrator, GMCalibParams +from caf.distribute.gravity_model.multi_area import ( + MultiAreaGravityModelCalibrator, + GMCalibParams, +) diff --git a/src/caf/distribute/gravity_model/core.py b/src/caf/distribute/gravity_model/core.py index 74161ff..ad6076b 100644 --- a/src/caf/distribute/gravity_model/core.py +++ b/src/caf/distribute/gravity_model/core.py @@ -90,7 +90,7 @@ class GravityModelCalibrateResults(GravityModelResults): # Targets target_cost_distribution: cost_utils.CostDistribution cost_function: cost_functions.CostFunction - cost_params: dict[str, Any] + cost_params: dict[str|int, Any] def plot_distributions(self, truncate_last_bin: bool = False) -> figure.Figure: """Plot a comparison of the achieved and target distributions. diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 4d2c062..4273bcc 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -783,8 +783,8 @@ def _jacobian_function( def _gravity_function( self, - init_params: dict[str, float], - cost_distributions: MGMCostDistribution, + init_params: dict[str|int, float], + cost_distributions: MultiCostDistribution, furness_tol: float, running_log_path: os.PathLike, params_len: int, diff --git a/tests/gravity_model/test_multi_area.py b/tests/gravity_model/test_multi_area.py index a0a24f7..7cb2b25 100644 --- a/tests/gravity_model/test_multi_area.py +++ b/tests/gravity_model/test_multi_area.py @@ -207,7 +207,11 @@ def fixture_cal_furness(self, data_dir, infilled, multi_tld, trip_ends, mock_dir cost_matrix=infilled, cost_function=cost_functions.BuiltInCostFunction.LOG_NORMAL.get_cost_function(), ) - results = model.calibrate(multi_tld, running_log_path=mock_dir / "temp_log.csv", gm_params=gm.GMCalibParams(furness_jac=True)) + results = model.calibrate( + multi_tld, + running_log_path=mock_dir / "temp_log.csv", + gm_params=gm.GMCalibParams(furness_jac=True), + ) return results @pytest.mark.parametrize("area", ["City", "Town", "External", "Village"]) From ebf4001437eada6e19284db33a0085b61011fb1e Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 11:46:31 +0000 Subject: [PATCH 34/50] ran black (again) --- src/caf/distribute/gravity_model/core.py | 2 +- src/caf/distribute/gravity_model/multi_area.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/caf/distribute/gravity_model/core.py b/src/caf/distribute/gravity_model/core.py index ad6076b..a265af1 100644 --- a/src/caf/distribute/gravity_model/core.py +++ b/src/caf/distribute/gravity_model/core.py @@ -90,7 +90,7 @@ class GravityModelCalibrateResults(GravityModelResults): # Targets target_cost_distribution: cost_utils.CostDistribution cost_function: cost_functions.CostFunction - cost_params: dict[str|int, Any] + cost_params: dict[str | int, Any] def plot_distributions(self, truncate_last_bin: bool = False) -> figure.Figure: """Plot a comparison of the achieved and target distributions. diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 4273bcc..e31ac82 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -783,7 +783,7 @@ def _jacobian_function( def _gravity_function( self, - init_params: dict[str|int, float], + init_params: dict[str | int, float], cost_distributions: MultiCostDistribution, furness_tol: float, running_log_path: os.PathLike, From 95a0ac0917466180db890f91616a86613d3e406a Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 11:53:21 +0000 Subject: [PATCH 35/50] set python to v3.10 attempt to fix tests on github --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 58eea2e..e28dc45 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,5 @@ matplotlib>=3.8.2 # Ipf requirements sparse>=0.13.0 numba>=0.60.0 + +python>=3.10 From 6911e52fa9c3f5ffc5776b87bb3cb21dddb65e4e Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 11:55:48 +0000 Subject: [PATCH 36/50] undo the 3.10 requirements. stupid fix added the annotations import --- requirements.txt | 1 - tests/gravity_model/test_multi_area.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e28dc45..2ac407e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,3 @@ matplotlib>=3.8.2 sparse>=0.13.0 numba>=0.60.0 -python>=3.10 diff --git a/tests/gravity_model/test_multi_area.py b/tests/gravity_model/test_multi_area.py index 7cb2b25..35f335a 100644 --- a/tests/gravity_model/test_multi_area.py +++ b/tests/gravity_model/test_multi_area.py @@ -1,3 +1,5 @@ +from __future__ import annotations + # Built-Ins from pathlib import Path From 7506f58d5d7923836b9de6b1693753af0e02ff21 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 12:09:09 +0000 Subject: [PATCH 37/50] added more annotation imports --- src/caf/distribute/gravity_model/core.py | 1 + src/caf/distribute/gravity_model/single_area.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/caf/distribute/gravity_model/core.py b/src/caf/distribute/gravity_model/core.py index a265af1..6229671 100644 --- a/src/caf/distribute/gravity_model/core.py +++ b/src/caf/distribute/gravity_model/core.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- """Core abstract functionality for gravity model classes to build on.""" +from __future__ import annotations # Built-Ins import abc import dataclasses diff --git a/src/caf/distribute/gravity_model/single_area.py b/src/caf/distribute/gravity_model/single_area.py index c6115e5..bd020b1 100644 --- a/src/caf/distribute/gravity_model/single_area.py +++ b/src/caf/distribute/gravity_model/single_area.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- """Implementation of a self-calibrating single area gravity model.""" +from __future__ import annotations # Built-Ins import functools import logging From bca6aa0f66d2884c025ae3270d441ecad6b2d4ce Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 12:54:15 +0000 Subject: [PATCH 38/50] ran isort --- src/caf/distribute/gravity_model/core.py | 1 + src/caf/distribute/gravity_model/single_area.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/caf/distribute/gravity_model/core.py b/src/caf/distribute/gravity_model/core.py index 6229671..1e83037 100644 --- a/src/caf/distribute/gravity_model/core.py +++ b/src/caf/distribute/gravity_model/core.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Core abstract functionality for gravity model classes to build on.""" from __future__ import annotations + # Built-Ins import abc import dataclasses diff --git a/src/caf/distribute/gravity_model/single_area.py b/src/caf/distribute/gravity_model/single_area.py index bd020b1..9e51d0d 100644 --- a/src/caf/distribute/gravity_model/single_area.py +++ b/src/caf/distribute/gravity_model/single_area.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Implementation of a self-calibrating single area gravity model.""" from __future__ import annotations + # Built-Ins import functools import logging From 2a2d794105acdc3892ca462f26bd7ebcc0f069be Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 13:45:35 +0000 Subject: [PATCH 39/50] linting error fixes --- requirements_dev.txt | 2 +- .../distribute/gravity_model/multi_area.py | 37 +++++-------------- 2 files changed, 10 insertions(+), 29 deletions(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index 74ceb45..8ed278e 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -4,7 +4,7 @@ isort>=5.12.0 mypy>=1.0.0 mypy_extensions>=1.0.0 pydocstyle[toml]>=6.1.1 -pylint>=2.14.5 +pylint==3.2 # Testing pytest>=7.4.0 diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index e31ac82..bd618b8 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -504,28 +504,6 @@ def __init__( if len(col_targets) != cost_matrix.shape[1]: raise IndexError("col_targets doesn't match cost_matrix") - def process_tlds(self): - """Get distributions in the right format for a multi-area gravity model.""" - dists = [] - for cat in self.tlds.index.unique(): - tld = self.tlds.loc[cat] - tld = cost_utils.CostDistribution(tld) - zones = self.lookup[self.lookup["cat"] == cat].index.values - if len(zones) == 0: - raise ValueError( - f"{cat} doesn't seem to appear in the given tld " - "lookup. Check for any typos (e.g. lower/upper case). " - f"If this is expected, remove {cat} from your " - "tlds dataframe before inputting." - ) - - distribution = MultiCostDistribution( - name=cat, cost_distribution=tld, zones=zones, function_params=self.init_params - ) - dists.append(distribution) - - return dists - def _calculate_perceived_factors( self, target_cost_distribution: cost_utils.CostDistribution, @@ -624,12 +602,15 @@ def calibrate( ) if min_cost < min_binning: warnings.warn( - f"the min cost in the cost matrix for" - f" category {dist.name}, was {min_cost}, " - "whereas the lowest bin edge in cost" - f" distribution was {min_binning}, " - "you will not be fitting to trips" - " with a cost less than the binning" + "the min cost in the cost matrix for" + " category %s, was %s," + " whereas the lowest bin edge in cost" + " distribution was %s, " + " you will not be fitting to trips" + " with a cost less than the binning", + dist.name, + min_cost, + min_binning, ) gravity_kwargs: dict[str, Any] = { From 7739a0034a9976f9faff8833a52c74c399a9d619 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 13:47:54 +0000 Subject: [PATCH 40/50] updated pylint version pylint --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9bd9164..14213fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ lint = [ "mypy>=1.0.0", "mypy_extensions>=1.0.0", "pydocstyle[toml]>=6.1.1", - "pylint>=2.14.5", + "pylint>=3.2", ] test = [ From d2c100ee7c2f6efeb33b8f08eea92441b0ea8905 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 13:57:34 +0000 Subject: [PATCH 41/50] MORE LINTING FIXES!!!!!! --- .../distribute/gravity_model/multi_area.py | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index bd618b8..59fb062 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -3,6 +3,7 @@ from __future__ import annotations # Built-Ins +import copy import functools import logging import os @@ -304,8 +305,15 @@ def __getitem__(self, x) -> MGMCostDistribution: def __len__(self) -> int: return len(self.distributions) + @property def copy(self) -> MultiCostDistribution: - return MultiCostDistribution(self.distributions) + """ + Returns + ------- + MultiCostDistribution + Deep copy of the object + """ + return copy.deepcopy(self) @dataclass @@ -462,7 +470,7 @@ def __init__( col_targets: np.ndarray, cost_matrix: np.ndarray, cost_function: cost_functions.CostFunction, - # TODO move these parameters as inputs of calibrate and run + # TODO(kf) move these parameters as inputs of calibrate and run ): super().__init__(cost_function=cost_function, cost_matrix=cost_matrix) @@ -592,13 +600,16 @@ def calibrate( min_cost = self.cost_matrix[dist.zones, :].min() if max_cost > max_binning: - LOG.warning( - f"the maximum cost in the cost matrix for" - f" category {dist.name}, was {max_cost}, " + warnings.warn( + "the maximum cost in the cost matrix for" + " category %s, was %s, " "whereas the highest bin edge in cost" - f" distribution was {max_binning}, " + " distribution was %s, " "you will not be fitting to trips" - " with a cost greater than the binning" + " with a cost greater than the binning", + dist.name, + max_cost, + max_binning, ) if min_cost < min_binning: warnings.warn( From fd84360649898191cbe27feda7364cca34bb956f Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 14:07:00 +0000 Subject: [PATCH 42/50] MOOOOOOORE PYLINT FIXES!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! --- src/caf/distribute/gravity_model/multi_area.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 59fb062..40938d6 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -176,6 +176,7 @@ class MultiCostDistribution: distributions: list[MGMCostDistribution] + # pylint: disable=Too many arguments @classmethod def from_pandas( cls, @@ -351,7 +352,7 @@ class MGMCostDistribution: # TODO(kf) validate params # TODO(kf) validate cost distributions - + # pylint: disable=too-many-arguments, too-many-locals @classmethod def from_pandas( cls, From 6e2dc71a3b37b8437fda7bc35774c857acc41310 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 14:24:10 +0000 Subject: [PATCH 43/50] pylint fixes (please work) and warnings bug fix --- .../distribute/gravity_model/multi_area.py | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 40938d6..03d41f1 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -176,7 +176,7 @@ class MultiCostDistribution: distributions: list[MGMCostDistribution] - # pylint: disable=Too many arguments + @classmethod def from_pandas( cls, @@ -237,6 +237,7 @@ def from_pandas( -------- `validate` """ + # pylint: disable=too-many-arguments distributions: list[MGMCostDistribution] = [] @@ -352,7 +353,6 @@ class MGMCostDistribution: # TODO(kf) validate params # TODO(kf) validate cost distributions - # pylint: disable=too-many-arguments, too-many-locals @classmethod def from_pandas( cls, @@ -411,6 +411,8 @@ def from_pandas( ValueError if zones in `cat_zone_correspondence` are not present in `ordered_zones` """ + # pylint: disable=too-many-arguments, too-many-locals + # get a list of zones that use this category of TLD cat_zones = cat_zone_correspondence.loc[ cat_zone_correspondence[lookup_cat_col] == category, lookup_zone_col @@ -603,26 +605,20 @@ def calibrate( if max_cost > max_binning: warnings.warn( "the maximum cost in the cost matrix for" - " category %s, was %s, " + f" category {dist.name}, was {max_cost}, " "whereas the highest bin edge in cost" - " distribution was %s, " + f" distribution was {max_binning}, " "you will not be fitting to trips" - " with a cost greater than the binning", - dist.name, - max_cost, - max_binning, + " with a cost greater than the binning" ) if min_cost < min_binning: warnings.warn( "the min cost in the cost matrix for" - " category %s, was %s," + f" category {dist.name}, was {min_cost}," " whereas the lowest bin edge in cost" - " distribution was %s, " + f" distribution was {min_binning}, " " you will not be fitting to trips" - " with a cost less than the binning", - dist.name, - min_cost, - min_binning, + " with a cost less than the binning" ) gravity_kwargs: dict[str, Any] = { From f847b314cdd37f5d9de3b89c05f4edfb5765ab1b Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Tue, 29 Oct 2024 14:24:29 +0000 Subject: [PATCH 44/50] ran black --- src/caf/distribute/gravity_model/multi_area.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 03d41f1..c796971 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -176,7 +176,6 @@ class MultiCostDistribution: distributions: list[MGMCostDistribution] - @classmethod def from_pandas( cls, @@ -412,7 +411,7 @@ def from_pandas( if zones in `cat_zone_correspondence` are not present in `ordered_zones` """ # pylint: disable=too-many-arguments, too-many-locals - + # get a list of zones that use this category of TLD cat_zones = cat_zone_correspondence.loc[ cat_zone_correspondence[lookup_cat_col] == category, lookup_zone_col From 5fa2e37b61e5c279a74cd364bb316c995f4f8ce5 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Wed, 30 Oct 2024 09:17:50 +0000 Subject: [PATCH 45/50] mypy fixes --- .../distribute/gravity_model/multi_area.py | 12 ++++--- tests/gravity_model/test_single_area.py | 35 ++++++++++++++----- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index c796971..1e1fbe9 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -306,7 +306,7 @@ def __getitem__(self, x) -> MGMCostDistribution: def __len__(self) -> int: return len(self.distributions) - @property + def copy(self) -> MultiCostDistribution: """ Returns @@ -476,6 +476,10 @@ def __init__( ): super().__init__(cost_function=cost_function, cost_matrix=cost_matrix) + # This is to stop MyPy moaning + self.achieved_distribution: np.ndarray + self._loop_start_time: float + if row_targets.sum() != col_targets.sum(): warnings.warn( "row and column target totals do not match. This is likely to cause Furnessing to fail." @@ -771,7 +775,7 @@ def _jacobian_function( def _gravity_function( self, - init_params: dict[str | int, float], + init_params: list[float], cost_distributions: MultiCostDistribution, furness_tol: float, running_log_path: os.PathLike, @@ -828,7 +832,7 @@ def _gravity_function( self._loop_start_time = timing.current_milli_time() self.achieved_cost_dist: list[cost_utils.CostDistribution] = distributions - self.achieved_convergence: dict[str, float] = convergences + self.achieved_convergence: dict[str|int, float] = convergences self.achieved_distribution = matrix achieved_residuals = np.concatenate(residuals) @@ -894,7 +898,7 @@ def run( def gravity_model( row_targets: pd.Series, col_targets: np.ndarray, - cost_distributions: list[MultiCostDistribution], + cost_distributions: MultiCostDistribution, cost_function: cost_functions.CostFunction, cost_mat: pd.DataFrame, furness_max_iters: int, diff --git a/tests/gravity_model/test_single_area.py b/tests/gravity_model/test_single_area.py index ec0595b..6f15427 100644 --- a/tests/gravity_model/test_single_area.py +++ b/tests/gravity_model/test_single_area.py @@ -20,6 +20,7 @@ from caf.distribute import cost_functions from caf.distribute.gravity_model import ( GravityModelCalibrateResults, + GravityModelResults, GravityModelRunResults, SingleAreaGravityModelCalibrator, ) @@ -359,66 +360,84 @@ def simple_gm_calib_results(tmp_path, cost_function) -> GMCalibrateResults: """Load in the small_and_simple test""" running_log_path = tmp_path / "run_log.csv" data_path = TEST_DATA_PATH / "small_and_simple" - return GMCalibrateResults.from_file( + results = GMCalibrateResults.from_file( path=data_path, running_log_path=running_log_path, cost_function=cost_function, ) + # this is to stop MyPy moaning + assert isinstance(results, GMCalibrateResults) + return results def real_gm_calib_results(tmp_path, cost_function) -> GMCalibrateResults: """Load in the real world test""" running_log_path = tmp_path / "run_log.csv" data_path = TEST_DATA_PATH / "realistic" - return GMCalibrateResults.from_file( + results = GMCalibrateResults.from_file( path=data_path, running_log_path=running_log_path, cost_function=cost_function, ) + # this is to stop MyPy moaning + assert isinstance(results, GMCalibrateResults) + return results -def real_gm_calib_perceived_results(tmp_path, cost_function) -> GMCalibrateResults: +def real_gm_calib_perceived_results(tmp_path, cost_function) -> GMCalibratePerceivedResults: """Load in the real world test""" running_log_path = tmp_path / "run_log.csv" data_path = TEST_DATA_PATH / "realistic" - return GMCalibratePerceivedResults.from_file( + results = GMCalibratePerceivedResults.from_file( path=data_path, running_log_path=running_log_path, cost_function=cost_function, ) + # this is to stop MyPy moaning + assert isinstance(results, GMCalibratePerceivedResults) + return results def simple_gm_run_results(tmp_path, cost_function) -> GMRunResults: """Load in the small_and_simple test""" running_log_path = tmp_path / "run_log.csv" data_path = TEST_DATA_PATH / "small_and_simple" - return GMRunResults.from_file( + results = GMRunResults.from_file( path=data_path, running_log_path=running_log_path, cost_function=cost_function, ) + # this is to stop MyPy moaning + assert isinstance(results, GMRunResults) + return results def real_gm_run_results(tmp_path, cost_function) -> GMRunResults: """Load in the real world test""" running_log_path = tmp_path / "run_log.csv" data_path = TEST_DATA_PATH / "realistic" - return GMRunResults.from_file( + results = GMRunResults.from_file( path=data_path, running_log_path=running_log_path, cost_function=cost_function, ) + # this is to stop MyPy moaning + assert isinstance(results, GMRunResults) + return results -def real_gm_run_perceived_results(tmp_path, cost_function) -> GMRunResults: +def real_gm_run_perceived_results(tmp_path, cost_function) -> GMRunPerceivedResults: """Load in the real world test""" running_log_path = tmp_path / "run_log.csv" data_path = TEST_DATA_PATH / "realistic" - return GMRunPerceivedResults.from_file( + results = GMRunPerceivedResults.from_file( path=data_path, running_log_path=running_log_path, cost_function=cost_function, ) + # this is to stop MyPy moaning + assert isinstance(results, GMRunPerceivedResults) + return results @pytest.fixture(name="simple_log_normal_calib") From 269942c8ffafe56d4ca7086bfa8d77e4fc172ecc Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Wed, 30 Oct 2024 10:12:01 +0000 Subject: [PATCH 46/50] ran black PyDocStyle fixes --- .../distribute/gravity_model/multi_area.py | 48 +++++++++++++++---- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 1e1fbe9..26bad53 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -96,7 +96,7 @@ class MultiDistInput(BaseConfig): @dataclass class GMCalibParams: - """Parameters required for the multi tld gravity mode calibrate method + """Parameters required for the multi tld gravity mode calibrate method. All of the arguements have defaults, i.e. you can create the default object with no arguements. HOWEVER, read the parameter section below, it is important to @@ -166,7 +166,7 @@ class GMCalibParams: @dataclass class MultiCostDistribution: - """Cost distributions to be used for the multi-cost distribution gravity model + """Cost distributions to be used for the multi-cost distribution gravity model. Parameters ---------- @@ -192,7 +192,7 @@ def from_pandas( lookup_cat_col: str = "category", lookup_zone_col: str = "zone_id", ) -> MultiCostDistribution: - """constructor using pandas dataframes + """Build class using pandas dataframes. Parameters ---------- @@ -266,7 +266,10 @@ def from_pandas( @classmethod def validate(cls, distributions: list[MGMCostDistribution]): - """Validates the distributions passed + """Checks the distributions passed. + + Raises an error if duplicate zones are found across different + distributions. Parameters ---------- @@ -298,17 +301,43 @@ def validate(cls, distributions: list[MGMCostDistribution]): raise ValueError("duplicate found in the distribution zone definition") def __iter__(self) -> Iterator[MGMCostDistribution]: + """Iterates through each distribution. + + Yields + ------ + Iterator[MGMCostDistribution] + iterator for the cost distributions. + """ yield from self.distributions - def __getitem__(self, x) -> MGMCostDistribution: + def __getitem__(self, x: int) -> MGMCostDistribution: + """Retrieves the xth distribution. + + Parameters + ---------- + x : int + index of the distribution to retreive + + Returns + ------- + MGMCostDistribution + the xth distrubtion. + """ return self.distributions[x] def __len__(self) -> int: - return len(self.distributions) + """The number of distrubtions. + Returns + ------- + int + The number of distrubtions. + """ + return len(self.distributions) def copy(self) -> MultiCostDistribution: - """ + """A wrapper around deepcopy. + Returns ------- MultiCostDistribution @@ -369,7 +398,7 @@ def from_pandas( lookup_cat_col: str = "category", lookup_zone_col: str = "zone_id", ) -> MGMCostDistribution: - """constructor that uses pandas dataframes and series + """Build using pandas dataframes and series. Parameters ---------- @@ -574,6 +603,7 @@ def calibrate( defines the detailed parameters, see `GMCalibParams` documentation for more info *args, **kwargs, + Returns ------- dict[str | int, GravityModelCalibrateResults]: @@ -832,7 +862,7 @@ def _gravity_function( self._loop_start_time = timing.current_milli_time() self.achieved_cost_dist: list[cost_utils.CostDistribution] = distributions - self.achieved_convergence: dict[str|int, float] = convergences + self.achieved_convergence: dict[str | int, float] = convergences self.achieved_distribution = matrix achieved_residuals = np.concatenate(residuals) From 9b64059f41d54611137ba074d9ea351cb22df4de Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Wed, 30 Oct 2024 10:16:39 +0000 Subject: [PATCH 47/50] fixing Imperative moods -_- --- src/caf/distribute/gravity_model/multi_area.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 26bad53..a166284 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -266,7 +266,7 @@ def from_pandas( @classmethod def validate(cls, distributions: list[MGMCostDistribution]): - """Checks the distributions passed. + """Check the distributions passed. Raises an error if duplicate zones are found across different distributions. @@ -301,7 +301,7 @@ def validate(cls, distributions: list[MGMCostDistribution]): raise ValueError("duplicate found in the distribution zone definition") def __iter__(self) -> Iterator[MGMCostDistribution]: - """Iterates through each distribution. + """Iterate through each distribution. Yields ------ @@ -311,7 +311,7 @@ def __iter__(self) -> Iterator[MGMCostDistribution]: yield from self.distributions def __getitem__(self, x: int) -> MGMCostDistribution: - """Retrieves the xth distribution. + """Retrieve the xth distribution. Parameters ---------- @@ -326,7 +326,7 @@ def __getitem__(self, x: int) -> MGMCostDistribution: return self.distributions[x] def __len__(self) -> int: - """The number of distrubtions. + """Number of distrubtions. Returns ------- @@ -336,7 +336,7 @@ def __len__(self) -> int: return len(self.distributions) def copy(self) -> MultiCostDistribution: - """A wrapper around deepcopy. + """Wrapper around deepcopy. Returns ------- From 6f43a74177b5ff3fecc3c05c4ec27d0827f29969 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Wed, 30 Oct 2024 10:26:26 +0000 Subject: [PATCH 48/50] I dont know what an imperative mood is but I dont like them --- src/caf/distribute/gravity_model/multi_area.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index a166284..198366d 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -326,7 +326,7 @@ def __getitem__(self, x: int) -> MGMCostDistribution: return self.distributions[x] def __len__(self) -> int: - """Number of distrubtions. + """ Get the number of distrubtions. Returns ------- @@ -336,7 +336,7 @@ def __len__(self) -> int: return len(self.distributions) def copy(self) -> MultiCostDistribution: - """Wrapper around deepcopy. + """ Get a copy of the object. Returns ------- From a6bd79d1989252bb60e4eccf30beed568e9eb27b Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Wed, 30 Oct 2024 10:26:50 +0000 Subject: [PATCH 49/50] ran black --- src/caf/distribute/gravity_model/multi_area.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index 198366d..ccd4d17 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -326,7 +326,7 @@ def __getitem__(self, x: int) -> MGMCostDistribution: return self.distributions[x] def __len__(self) -> int: - """ Get the number of distrubtions. + """Get the number of distrubtions. Returns ------- @@ -336,7 +336,7 @@ def __len__(self) -> int: return len(self.distributions) def copy(self) -> MultiCostDistribution: - """ Get a copy of the object. + """Get a copy of the object. Returns ------- From d7c1d36c9c017bd2db1887ab59a1339db87c0e92 Mon Sep 17 00:00:00 2001 From: Kieran-Fishwick-TfN Date: Thu, 31 Oct 2024 16:00:39 +0000 Subject: [PATCH 50/50] removed redundant TODO --- src/caf/distribute/gravity_model/multi_area.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/caf/distribute/gravity_model/multi_area.py b/src/caf/distribute/gravity_model/multi_area.py index ccd4d17..36ff298 100644 --- a/src/caf/distribute/gravity_model/multi_area.py +++ b/src/caf/distribute/gravity_model/multi_area.py @@ -501,7 +501,6 @@ def __init__( col_targets: np.ndarray, cost_matrix: np.ndarray, cost_function: cost_functions.CostFunction, - # TODO(kf) move these parameters as inputs of calibrate and run ): super().__init__(cost_function=cost_function, cost_matrix=cost_matrix)