From 309d07558936369abec5dc5eab70bc322113c7d3 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 12 Jan 2024 12:25:09 +0200 Subject: [PATCH 1/5] Add raster reading utility function --- eis_toolkit/utilities/file_io.py | 70 ++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 7 deletions(-) diff --git a/eis_toolkit/utilities/file_io.py b/eis_toolkit/utilities/file_io.py index 113d662b..cebc7388 100644 --- a/eis_toolkit/utilities/file_io.py +++ b/eis_toolkit/utilities/file_io.py @@ -1,12 +1,14 @@ from pathlib import Path -from typing import Union import geopandas as gpd +import numpy as np import pandas as pd import rasterio from beartype import beartype +from beartype.typing import Literal, Sequence, Tuple, Union -from eis_toolkit.exceptions import FileReadError +from eis_toolkit import exceptions +from eis_toolkit.utilities.checks.raster import check_raster_grids @beartype @@ -29,7 +31,7 @@ def read_file(file_path: Path) -> Union[rasterio.io.DatasetReader, gpd.GeoDataFr # Try to read as a raster first try: data = read_raster(file_path) - except FileReadError: + except exceptions.FileReadError: # Try to read as a GeoDataFrame try: @@ -41,7 +43,7 @@ def read_file(file_path: Path) -> Union[rasterio.io.DatasetReader, gpd.GeoDataFr data = pd.read_csv(file_path) except pd.errors.ParserError: # If none of the readers succeeded, raise an exception - raise FileReadError(f"Failed to file {file_path} as raster, geodataframe or dataframe") + raise exceptions.FileReadError(f"Failed to read file {file_path} as raster, geodataframe or dataframe") return data @@ -62,10 +64,64 @@ def read_raster(file_path: Path) -> rasterio.io.DatasetReader: try: data = rasterio.open(file_path) except rasterio.errors.RasterioIOError: - raise FileReadError(f"Failed to read raster data from {file_path}.") + raise exceptions.FileReadError(f"Failed to read raster data from {file_path}.") return data +def read_and_stack_rasters( + raster_files: Sequence[Path], + nodata_handling: 
Literal["convert_to_nan", "unify", "raise_exception", "none"] = "convert_to_nan", +) -> Tuple[np.ndarray, Sequence[rasterio.profiles.Profile]]: + """ + Read multiple raster files and stack all their bands into a single 3D array. + + Checks that all rasters have the same grid properties. If there are any differences, exception is raised. + + Args: + raster_files: List of paths to raster files. + nodata_handling: How to handle raster nodata. convert_to_nan changes all nodata to np.nan, unify + changes all rasters to use -9999 as their nodata value, raise_exception raises an exception if + all rasters do not have the same nodata value, and none does not do anything for nodata. + + Returns: + 3D array with shape (total bands, height, width) and a list of raster profiles. + + Raises: + NonMatchingRasterMetadataException: If input rasters do not have same grid properties or nodata_handling + is set to raise exception and mismatching nodata is encountered. + """ + bands = [] + nodata_values = [] + profiles = [] + + for raster_file in raster_files: + with rasterio.open(raster_file) as raster: + # Read all bands from each raster + profile = raster.profile + for i in range(1, raster.count + 1): + band_data = raster.read(i) + + if nodata_handling == "convert_to_nan": + band_data[band_data == profile["nodata"]] = np.nan + elif nodata_handling == "unify": + band_data[band_data == profile["nodata"]] = -9999 + profile["nodata"] = -9999 + elif nodata_handling == "raise_exception": + nodata_values.append(profile["nodata"]) + if len(set(nodata_values)) > 1: + raise exceptions.NonMatchingRasterMetadataException("Input rasters have varying nodata values.") + elif nodata_handling == "none": + pass + bands.append(band_data) + profiles.append(profile) + + if not check_raster_grids(profiles, same_extent=True): + raise exceptions.NonMatchingRasterMetadataException("Input rasters should have the same properties.") + + # Stack all bands into a single 3D array + return np.stack(bands, axis=0), profiles + + @beartype 
def read_vector(file_path: Path) -> gpd.GeoDataFrame: """Read a vector file to a GeoDataFrame. @@ -82,7 +138,7 @@ def read_vector(file_path: Path) -> gpd.GeoDataFrame: try: data = gpd.read_file(file_path) except (ValueError, OSError): - raise FileReadError(f"Failed to read vector data from {file_path}.") + raise exceptions.FileReadError(f"Failed to read vector data from {file_path}.") return data @@ -102,5 +158,5 @@ def read_tabular(file_path: Path) -> pd.DataFrame: try: data = pd.read_csv(file_path) except pd.errors.ParserError: - raise FileReadError(f"Failed to read tabular data from {file_path}.") + raise exceptions.FileReadError(f"Failed to read tabular data from {file_path}.") return data From 6ee8a5d628325aa7a045a23890edb512f48f5ecd Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 12 Jan 2024 12:25:46 +0200 Subject: [PATCH 2/5] Add new exception for raster metadata mismatches --- eis_toolkit/exceptions.py | 4 ++++ eis_toolkit/raster_processing/unique_combinations.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/eis_toolkit/exceptions.py b/eis_toolkit/exceptions.py index c7e42e1c..bb665a8b 100644 --- a/eis_toolkit/exceptions.py +++ b/eis_toolkit/exceptions.py @@ -74,6 +74,10 @@ class NonMatchingCrsException(Exception): """Exception error class for CRS mismatches.""" +class NonMatchingRasterMetadataException(Exception): + """Exception error class for raster metadata mismatches.""" + + class NonMatchingParameterLengthsException(Exception): """Exception error class for parameters with different lenghts.""" diff --git a/eis_toolkit/raster_processing/unique_combinations.py b/eis_toolkit/raster_processing/unique_combinations.py index 58a1af80..357b312b 100644 --- a/eis_toolkit/raster_processing/unique_combinations.py +++ b/eis_toolkit/raster_processing/unique_combinations.py @@ -3,7 +3,7 @@ from beartype import beartype from beartype.typing import Sequence, Tuple -from eis_toolkit.exceptions import 
InvalidParameterValueException +from eis_toolkit.exceptions import InvalidParameterValueException, NonMatchingRasterMetadataException from eis_toolkit.utilities.checks.raster import check_raster_grids @@ -37,8 +37,12 @@ def unique_combinations( # type: ignore[no-any-unimported] raster_list: Rasters to be used for finding combinations. Returns: - out_image: Combinations of rasters. - out_meta: The metadata of the first raster in raster_list. + Combinations of rasters. + The metadata of the first raster in raster_list. + + Raises: + InvalidParameterValueException: Total number of raster bands is 1. + NonMatchingRasterMetadataException: Input raster grids do not have the same grid properties. """ bands = [] out_meta = raster_list[0].meta @@ -54,7 +58,7 @@ def unique_combinations( # type: ignore[no-any-unimported] raise InvalidParameterValueException("Expected to have more bands than 1") if check_raster_grids(raster_profiles) is not True: - raise InvalidParameterValueException("Expected raster grids to be of same shape") + raise NonMatchingRasterMetadataException("Expected raster grids to have the same grid properties.") out_image = _unique_combinations(bands) return out_image, out_meta From 2668cbe56ff62e1d09441925f5b4f5ad839a3848 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 12 Jan 2024 12:26:08 +0200 Subject: [PATCH 3/5] Add CLI functions for PCA raster and vector separately --- eis_toolkit/cli.py | 85 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 77 insertions(+), 8 deletions(-) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index 33746034..87033b7f 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -10,6 +10,7 @@ from typing import List, Optional, Tuple import geopandas as gpd +import numpy as np import pandas as pd import rasterio import typer @@ -142,6 +143,13 @@ class GradientBoostingRegressorLosses(str, Enum): quantile = "quantile" +class NodataHandling(str, Enum): + """Nodata handling choices.""" + + replace = "replace" + 
remove = "remove" + + RESAMPLING_MAPPING = { "nearest": warp.Resampling.nearest, "bilinear": warp.Resampling.bilinear, @@ -305,25 +313,86 @@ def parallel_coordinates_cli( typer.echo("Parallel coordinates plot completed" + echo_str_end) -# PCA +# PCA FOR RASTER DATA +@app.command() +def compute_pca_raster_cli( + input_rasters: Annotated[List[Path], INPUT_FILE_OPTION], + output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + number_of_components: int = typer.Option(), + # NOTE: Omitted scaler type selection here since the parameter might be deleted from PCA func + nodata_handling: NodataHandling = NodataHandling.remove, + # NOTE: Omitted nodata parameter. Should use raster nodata. +): + """Compute defined number of principal components for raster data.""" + from eis_toolkit.exploratory_analyses.pca import compute_pca + from eis_toolkit.utilities.file_io import read_and_stack_rasters + + typer.echo("Progress: 10%") + + stacked_array, profiles = read_and_stack_rasters(input_rasters, nodata_handling="convert_to_nan") + typer.echo("Progress: 25%") + + pca_array, variance_ratios = compute_pca( + data=stacked_array, number_of_components=number_of_components, nodata_handling=nodata_handling + ) + + # Fill np.nan with nodata before writing data to raster + pca_array[np.isnan(pca_array)] = -9999 + out_profile = profiles[0] + out_profile["nodata"] = -9999 + + # Create dictionary from the variance ratios array + variances_ratios_dict = {} + for i, variance_ratio in enumerate(variance_ratios): + name = "PC " + str(i) + " explained variance" + variances_ratios_dict[name] = variance_ratio + json_str = json.dumps(variances_ratios_dict) + + with rasterio.open(output_raster, "w", **out_profile) as dst: + dst.write(pca_array) + typer.echo("Progress: 100%") + typer.echo(f"Results: {json_str}") + typer.echo(f"PCA computation (raster) completed, output raster saved to {output_raster}.") + + +# PCA FOR VECTOR DATA @app.command() -def compute_pca_cli( 
input_vector: Annotated[Path, INPUT_FILE_OPTION], - output_file: Annotated[Path, OUTPUT_FILE_OPTION], + output_vector: Annotated[Path, OUTPUT_FILE_OPTION], number_of_components: int = typer.Option(), + columns: Annotated[List[str], typer.Option()] = None, + # NOTE: Omitted scaler type selection here since the parameter might be deleted from PCA func + nodata_handling: NodataHandling = NodataHandling.remove, + nodata: float = None, ): - """Compute principal components for the input data.""" + """Compute defined number of principal components for vector data.""" from eis_toolkit.exploratory_analyses.pca import compute_pca typer.echo("Progress: 10%") - geodataframe = gpd.read_file(input_vector) # TODO: Check if gdf to df handling in tool itself - dataframe = pd.DataFrame(geodataframe.drop(columns="geometry")) + gdf = gpd.read_file(input_vector) typer.echo("Progress: 25%") - pca_df, variance_ratios = compute_pca(data=dataframe, number_of_components=number_of_components) + pca_gdf, variance_ratios = compute_pca( + data=gdf, + number_of_components=number_of_components, + columns=columns, + nodata_handling=nodata_handling, + nodata=nodata, + ) - pca_df.to_csv(output_file) + # Create dictionary from the variance ratios array + variances_ratios_dict = {} + for i, variance_ratio in enumerate(variance_ratios): + name = "PC " + str(i) + " explained variance" + variances_ratios_dict[name] = variance_ratio + json_str = json.dumps(variances_ratios_dict) + + pca_gdf.to_file(output_vector) + typer.echo("Progress: 100%") + typer.echo(f"Results: {json_str}") + typer.echo(f"PCA computation (vector) completed, output vector saved to {output_vector}.") # DESCRIPTIVE STATISTICS (RASTER) From 1ad58f5e1ce8d73a452ea3f9e2bb36e0f68cb773 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Mon, 15 Jan 2024 10:31:08 +0200 Subject: [PATCH 4/5] Fixed PCA raster CLI to update metadata correctly and change input raster option to argument --- eis_toolkit/cli.py | 6 +++++- 1 file changed, 5 insertions(+), 
1 deletion(-) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index 87033b7f..8f5b2a59 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -316,7 +316,7 @@ def parallel_coordinates_cli( # PCA FOR RASTER DATA @app.command() def compute_pca_raster_cli( - input_rasters: Annotated[List[Path], INPUT_FILE_OPTION], + input_rasters: INPUT_FILES_ARGUMENT, output_raster: Annotated[Path, OUTPUT_FILE_OPTION], number_of_components: int = typer.Option(), # NOTE: Omitted scaler type selection here since the parameter might be deleted from PCA func @@ -341,6 +341,9 @@ def compute_pca_raster_cli( out_profile = profiles[0] out_profile["nodata"] = -9999 + # Update nr of bands + out_profile["count"] = number_of_components + # Create dictionary from the variance ratios array variances_ratios_dict = {} for i, variance_ratio in enumerate(variance_ratios): @@ -350,6 +353,7 @@ def compute_pca_raster_cli( with rasterio.open(output_raster, "w", **out_profile) as dst: dst.write(pca_array) + typer.echo("Progress: 100%") typer.echo(f"Results: {json_str}") typer.echo(f"PCA computation (raster) completed, output raster saved to {output_raster}.") From 2af4980d68a87f05c055dd500959726c36618ec6 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Thu, 18 Jan 2024 09:42:17 +0200 Subject: [PATCH 5/5] Add empty list check for columns arg --- eis_toolkit/exploratory_analyses/pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eis_toolkit/exploratory_analyses/pca.py b/eis_toolkit/exploratory_analyses/pca.py index 2cb2ff39..c1a36a86 100644 --- a/eis_toolkit/exploratory_analyses/pca.py +++ b/eis_toolkit/exploratory_analyses/pca.py @@ -145,7 +145,7 @@ def compute_pca( geometries = data.geometry crs = data.crs df = df.drop(columns=["geometry"]) - if columns is not None: + if columns is not None and columns != []: if not check_columns_valid(df, columns): raise exceptions.InvalidColumnException("All selected columns were not found in the input DataFrame.") df = df[columns]