From 309d07558936369abec5dc5eab70bc322113c7d3 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 12 Jan 2024 12:25:09 +0200 Subject: [PATCH 1/5] Add raster reading utility function --- eis_toolkit/utilities/file_io.py | 70 ++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 7 deletions(-) diff --git a/eis_toolkit/utilities/file_io.py b/eis_toolkit/utilities/file_io.py index 113d662b..cebc7388 100644 --- a/eis_toolkit/utilities/file_io.py +++ b/eis_toolkit/utilities/file_io.py @@ -1,12 +1,14 @@ from pathlib import Path -from typing import Union import geopandas as gpd +import numpy as np import pandas as pd import rasterio from beartype import beartype +from beartype.typing import Literal, Sequence, Tuple, Union -from eis_toolkit.exceptions import FileReadError +from eis_toolkit import exceptions +from eis_toolkit.utilities.checks.raster import check_raster_grids @beartype @@ -29,7 +31,7 @@ def read_file(file_path: Path) -> Union[rasterio.io.DatasetReader, gpd.GeoDataFr # Try to read as a raster first try: data = read_raster(file_path) - except FileReadError: + except exceptions.FileReadError: # Try to read as a GeoDataFrame try: @@ -41,7 +43,7 @@ def read_file(file_path: Path) -> Union[rasterio.io.DatasetReader, gpd.GeoDataFr data = pd.read_csv(file_path) except pd.errors.ParserError: # If none of the readers succeeded, raise an exception - raise FileReadError(f"Failed to file {file_path} as raster, geodataframe or dataframe") + raise exceptions.FileReadError(f"Failed to read file {file_path} as raster, geodataframe or dataframe") return data @@ -62,10 +64,64 @@ def read_raster(file_path: Path) -> rasterio.io.DatasetReader: try: data = rasterio.open(file_path) except rasterio.errors.RasterioIOError: - raise FileReadError(f"Failed to read raster data from {file_path}.") + raise exceptions.FileReadError(f"Failed to read raster data from {file_path}.") return data +def read_and_stack_rasters( + raster_files: Sequence[Path], + nodata_handling: 
Literal["convert_to_nan", "unify", "raise_exception", "none"] = "convert_to_nan", +) -> Tuple[np.ndarray, Sequence[rasterio.profiles.Profile]]: + """ + Read multiple raster files and stack all their bands into a single 3D array. + + Checks that all rasters have the same grid properties. If there are any differences, exception is raised. + + Args: + raster_files: List of paths to raster files. + nodata_handling: How to handle raster nodata. convert_to_nan changes all nodata to np.nan, unify + changes all rasters to use -9999 as their nodata value, raise_exception raises an exception if + all rasters do not have the same nodata value, and none does not do anything for nodata. + + Returns: + 3D array with shape (total bands, height, width) and a list of raster profiles. + + Raises: + NonMatchingRasterMetadataException: If input rasters do not have same grid properties or nodata_handling + is set to raise exception and mismatching nodata is encountered. + """ + bands = [] + nodata_values = [] + profiles = [] + + for raster_file in raster_files: + with rasterio.open(raster_file) as raster: + # Read all bands from each raster + profile = raster.profile + for i in range(1, raster.count + 1): + band_data = raster.read(i) + + if nodata_handling == "convert_to_nan": + band_data[band_data == profile["nodata"]] = np.nan + elif nodata_handling == "unify": + band_data[band_data == profile["nodata"]] = -9999 + profile["nodata"] = -9999 + elif nodata_handling == "raise_exception": + nodata_values.append(profile["nodata"]) + if len(set(nodata_values)) > 1: + raise exceptions.NonMatchingRasterMetadataException("Input rasters have varying nodata values.") + elif nodata_handling == "none": + pass + bands.append(band_data) + profiles.append(profile) + + if not check_raster_grids(profiles, same_extent=True): + raise exceptions.NonMatchingRasterMetadataException("Input rasters should have the same properties.") + + # Stack all bands into a single 3D array + return np.stack(bands, axis=0), profiles + + @beartype 
def read_vector(file_path: Path) -> gpd.GeoDataFrame: """Read a vector file to a GeoDataFrame. @@ -82,7 +138,7 @@ def read_vector(file_path: Path) -> gpd.GeoDataFrame: try: data = gpd.read_file(file_path) except (ValueError, OSError): - raise FileReadError(f"Failed to read vector data from {file_path}.") + raise exceptions.FileReadError(f"Failed to read vector data from {file_path}.") return data @@ -102,5 +158,5 @@ def read_tabular(file_path: Path) -> pd.DataFrame: try: data = pd.read_csv(file_path) except pd.errors.ParserError: - raise FileReadError(f"Failed to read tabular data from {file_path}.") + raise exceptions.FileReadError(f"Failed to read tabular data from {file_path}.") return data From 6ee8a5d628325aa7a045a23890edb512f48f5ecd Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 12 Jan 2024 12:25:46 +0200 Subject: [PATCH 2/5] Add new exception for raster metadata mismatches --- eis_toolkit/exceptions.py | 4 ++++ eis_toolkit/raster_processing/unique_combinations.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/eis_toolkit/exceptions.py b/eis_toolkit/exceptions.py index c7e42e1c..bb665a8b 100644 --- a/eis_toolkit/exceptions.py +++ b/eis_toolkit/exceptions.py @@ -74,6 +74,10 @@ class NonMatchingCrsException(Exception): """Exception error class for CRS mismatches.""" +class NonMatchingRasterMetadataException(Exception): + """Exception error class for raster metadata mismatches.""" + + class NonMatchingParameterLengthsException(Exception): """Exception error class for parameters with different lenghts.""" diff --git a/eis_toolkit/raster_processing/unique_combinations.py b/eis_toolkit/raster_processing/unique_combinations.py index 58a1af80..357b312b 100644 --- a/eis_toolkit/raster_processing/unique_combinations.py +++ b/eis_toolkit/raster_processing/unique_combinations.py @@ -3,7 +3,7 @@ from beartype import beartype from beartype.typing import Sequence, Tuple -from eis_toolkit.exceptions import 
InvalidParameterValueException +from eis_toolkit.exceptions import InvalidParameterValueException, NonMatchingRasterMetadataException from eis_toolkit.utilities.checks.raster import check_raster_grids @@ -37,8 +37,12 @@ def unique_combinations( # type: ignore[no-any-unimported] raster_list: Rasters to be used for finding combinations. Returns: - out_image: Combinations of rasters. - out_meta: The metadata of the first raster in raster_list. + Combinations of rasters. + The metadata of the first raster in raster_list. + + Raises: + InvalidParameterValueException: Total number of raster bands is 1. + NonMatchingRasterMetadataException: Input raster grids do not have the same grid properties. """ bands = [] out_meta = raster_list[0].meta @@ -54,7 +58,7 @@ def unique_combinations( # type: ignore[no-any-unimported] raise InvalidParameterValueException("Expected to have more bands than 1") if check_raster_grids(raster_profiles) is not True: - raise InvalidParameterValueException("Expected raster grids to be of same shape") + raise NonMatchingRasterMetadataException("Expected raster grids to have the same grid properties.") out_image = _unique_combinations(bands) return out_image, out_meta From 2668cbe56ff62e1d09441925f5b4f5ad839a3848 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 12 Jan 2024 12:26:08 +0200 Subject: [PATCH 3/5] Add CLI functions for PCA raster and vector separately --- eis_toolkit/cli.py | 85 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 77 insertions(+), 8 deletions(-) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index 33746034..87033b7f 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -10,6 +10,7 @@ from typing import List, Optional, Tuple import geopandas as gpd +import numpy as np import pandas as pd import rasterio import typer @@ -142,6 +143,13 @@ class GradientBoostingRegressorLosses(str, Enum): quantile = "quantile" +class NodataHandling(str, Enum): + """Nodata handling choices.""" + + replace = "replace" + 
remove = "remove" + + RESAMPLING_MAPPING = { "nearest": warp.Resampling.nearest, "bilinear": warp.Resampling.bilinear, @@ -305,25 +313,86 @@ def parallel_coordinates_cli( typer.echo("Parallel coordinates plot completed" + echo_str_end) -# PCA +# PCA FOR RASTER DATA +@app.command() +def compute_pca_raster_cli( + input_rasters: Annotated[List[Path], INPUT_FILE_OPTION], + output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + number_of_components: int = typer.Option(), + # NOTE: Omitted scaler type selection here since the parameter might be deleted from PCA func + nodata_handling: NodataHandling = NodataHandling.remove, + # NOTE: Omitted nodata parameter. Should use raster nodata. +): + """Compute defined number of principal components for raster data.""" + from eis_toolkit.exploratory_analyses.pca import compute_pca + from eis_toolkit.utilities.file_io import read_and_stack_rasters + + typer.echo("Progress: 10%") + + stacked_array, profiles = read_and_stack_rasters(input_rasters, nodata_handling="convert_to_nan") + typer.echo("Progress: 25%") + + pca_array, variance_ratios = compute_pca( + data=stacked_array, number_of_components=number_of_components, nodata_handling=nodata_handling + ) + + # Fill np.nan with nodata before writing data to raster + pca_array[np.isnan(pca_array)] = -9999 + out_profile = profiles[0] + out_profile["nodata"] = -9999 + + # Create dictionary from the variance ratios array + variances_ratios_dict = {} + for i, variance_ratio in enumerate(variance_ratios): + name = "PC " + str(i) + " explained variance" + variances_ratios_dict[name] = variance_ratio + json_str = json.dumps(variances_ratios_dict) + + with rasterio.open(output_raster, "w", **out_profile) as dst: + dst.write(pca_array) + typer.echo("Progress: 100%") + typer.echo(f"Results: {json_str}") + typer.echo(f"PCA computation (raster) completed, output raster saved to {output_raster}.") + + +# PCA FOR VECTOR DATA @app.command() -def compute_pca_cli( 
input_vector: Annotated[Path, INPUT_FILE_OPTION], - output_file: Annotated[Path, OUTPUT_FILE_OPTION], + output_vector: Annotated[Path, OUTPUT_FILE_OPTION], number_of_components: int = typer.Option(), + columns: Annotated[List[str], typer.Option()] = None, + # NOTE: Omitted scaler type selection here since the parameter might be deleted from PCA func + nodata_handling: NodataHandling = NodataHandling.remove, + nodata: float = None, ): - """Compute principal components for the input data.""" + """Compute defined number of principal components for vector data.""" from eis_toolkit.exploratory_analyses.pca import compute_pca typer.echo("Progress: 10%") - geodataframe = gpd.read_file(input_vector) # TODO: Check if gdf to df handling in tool itself - dataframe = pd.DataFrame(geodataframe.drop(columns="geometry")) + gdf = gpd.read_file(input_vector) typer.echo("Progress: 25%") - pca_df, variance_ratios = compute_pca(data=dataframe, number_of_components=number_of_components) + pca_gdf, variance_ratios = compute_pca( + data=gdf, + number_of_components=number_of_components, + columns=columns, + nodata_handling=nodata_handling, + nodata=nodata, + ) - pca_df.to_csv(output_file) + # Create dictionary from the variance ratios array + variances_ratios_dict = {} + for i, variance_ratio in enumerate(variance_ratios): + name = "PC " + str(i) + " explained variance" + variances_ratios_dict[name] = variance_ratio + json_str = json.dumps(variances_ratios_dict) + + pca_gdf.to_file(output_vector) + typer.echo("Progress: 100%") + typer.echo(f"Results: {json_str}") + typer.echo(f"PCA computation (vector) completed, output vector saved to {output_vector}.") # DESCRIPTIVE STATISTICS (RASTER) From 1ad58f5e1ce8d73a452ea3f9e2bb36e0f68cb773 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Mon, 15 Jan 2024 10:31:08 +0200 Subject: [PATCH 4/5] Fixed PCA raster CLI to update metadata correctly and change input raster option to argument --- eis_toolkit/cli.py | 6 +++++- 1 file changed, 5 insertions(+), 
1 deletion(-) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index 87033b7f..8f5b2a59 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -316,7 +316,7 @@ def parallel_coordinates_cli( # PCA FOR RASTER DATA @app.command() def compute_pca_raster_cli( - input_rasters: Annotated[List[Path], INPUT_FILE_OPTION], + input_rasters: INPUT_FILES_ARGUMENT, output_raster: Annotated[Path, OUTPUT_FILE_OPTION], number_of_components: int = typer.Option(), # NOTE: Omitted scaler type selection here since the parameter might be deleted from PCA func @@ -341,6 +341,9 @@ def compute_pca_raster_cli( out_profile = profiles[0] out_profile["nodata"] = -9999 + # Update nr of bands + out_profile["count"] = number_of_components + # Create dictionary from the variance ratios array variances_ratios_dict = {} for i, variance_ratio in enumerate(variance_ratios): @@ -350,6 +353,7 @@ def compute_pca_raster_cli( with rasterio.open(output_raster, "w", **out_profile) as dst: dst.write(pca_array) + typer.echo("Progress: 100%") typer.echo(f"Results: {json_str}") typer.echo(f"PCA computation (raster) completed, output raster saved to {output_raster}.") From 2af4980d68a87f05c055dd500959726c36618ec6 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Thu, 18 Jan 2024 09:42:17 +0200 Subject: [PATCH 5/5] Add empty list check for columns arg --- eis_toolkit/exploratory_analyses/pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eis_toolkit/exploratory_analyses/pca.py b/eis_toolkit/exploratory_analyses/pca.py index 2cb2ff39..c1a36a86 100644 --- a/eis_toolkit/exploratory_analyses/pca.py +++ b/eis_toolkit/exploratory_analyses/pca.py @@ -145,7 +145,7 @@ def compute_pca( geometries = data.geometry crs = data.crs df = df.drop(columns=["geometry"]) - if columns is not None: + if columns is not None and columns != []: if not check_columns_valid(df, columns): raise exceptions.InvalidColumnException("All selected columns were not found in the input DataFrame.") df = df[columns]