Skip to content

Commit

Permalink
Merge pull request #284 from GispoCoding/283-modify-pca-cli-function
Browse files Browse the repository at this point in the history
283 modify PCA CLI function
  • Loading branch information
nmaarnio authored Jan 18, 2024
2 parents 1e81919 + 2af4980 commit 4fed630
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 20 deletions.
89 changes: 81 additions & 8 deletions eis_toolkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing import List, Optional, Tuple

import geopandas as gpd
import numpy as np
import pandas as pd
import rasterio
import typer
Expand Down Expand Up @@ -142,6 +143,13 @@ class GradientBoostingRegressorLosses(str, Enum):
quantile = "quantile"


class NodataHandling(str, Enum):
    """Available strategies for handling nodata values in PCA computation."""

    # Substitute nodata cells with a replacement value before computing.
    replace = "replace"
    # Drop rows/cells containing nodata from the computation.
    remove = "remove"


RESAMPLING_MAPPING = {
"nearest": warp.Resampling.nearest,
"bilinear": warp.Resampling.bilinear,
Expand Down Expand Up @@ -305,25 +313,90 @@ def parallel_coordinates_cli(
typer.echo("Parallel coordinates plot completed" + echo_str_end)


# PCA FOR RASTER DATA
@app.command()
def compute_pca_raster_cli(
    input_rasters: INPUT_FILES_ARGUMENT,
    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
    number_of_components: int = typer.Option(),
    # NOTE: Omitted scaler type selection here since the parameter might be deleted from PCA func
    nodata_handling: NodataHandling = NodataHandling.remove,
    # NOTE: Omitted nodata parameter. Should use raster nodata.
):
    """Compute defined number of principal components for raster data.

    Reads and stacks all bands of the input rasters (converting nodata to NaN),
    runs PCA on the stacked array, writes the component bands to `output_raster`
    with -9999 as nodata, and echoes the explained-variance ratios as JSON.
    """
    from eis_toolkit.exploratory_analyses.pca import compute_pca
    from eis_toolkit.utilities.file_io import read_and_stack_rasters

    typer.echo("Progress: 10%")

    stacked_array, profiles = read_and_stack_rasters(input_rasters, nodata_handling="convert_to_nan")
    typer.echo("Progress: 25%")

    pca_array, variance_ratios = compute_pca(
        data=stacked_array, number_of_components=number_of_components, nodata_handling=nodata_handling
    )

    # Fill np.nan with nodata before writing data to raster.
    # BUGFIX: `pca_array == np.nan` is always False (NaN is never equal to
    # anything, including itself), so the original masking never replaced any
    # values; np.isnan is required to locate NaN cells.
    pca_array[np.isnan(pca_array)] = -9999
    out_profile = profiles[0]
    out_profile["nodata"] = -9999

    # Update nr of bands
    out_profile["count"] = number_of_components

    # Create dictionary from the variance ratios array
    variances_ratios_dict = {}
    for i, variance_ratio in enumerate(variance_ratios):
        name = "PC " + str(i) + " explained variance"
        variances_ratios_dict[name] = variance_ratio
    json_str = json.dumps(variances_ratios_dict)

    with rasterio.open(output_raster, "w", **out_profile) as dst:
        dst.write(pca_array)

    typer.echo("Progress: 100%")
    typer.echo(f"Results: {json_str}")
    typer.echo(f"PCA computation (raster) completed, output raster saved to {output_raster}.")


# PCA FOR VECTOR DATA
@app.command()
def compute_pca_vector_cli(
    input_vector: Annotated[Path, INPUT_FILE_OPTION],
    output_vector: Annotated[Path, OUTPUT_FILE_OPTION],
    number_of_components: int = typer.Option(),
    columns: Annotated[List[str], typer.Option()] = None,
    # NOTE: Omitted scaler type selection here since the parameter might be deleted from PCA func
    nodata_handling: NodataHandling = NodataHandling.remove,
    nodata: float = None,
):
    """Compute defined number of principal components for vector data.

    Reads the input vector file into a GeoDataFrame, runs PCA on the selected
    columns (all numeric columns if `columns` is not given), writes the result
    to `output_vector`, and echoes the explained-variance ratios as JSON.
    """
    from eis_toolkit.exploratory_analyses.pca import compute_pca

    typer.echo("Progress: 10%")

    gdf = gpd.read_file(input_vector)
    typer.echo("Progress: 25%")

    pca_gdf, variance_ratios = compute_pca(
        data=gdf,
        number_of_components=number_of_components,
        columns=columns,
        nodata_handling=nodata_handling,
        nodata=nodata,
    )

    # Create dictionary from the variance ratios array
    variances_ratios_dict = {}
    for i, variance_ratio in enumerate(variance_ratios):
        name = "PC " + str(i) + " explained variance"
        variances_ratios_dict[name] = variance_ratio
    json_str = json.dumps(variances_ratios_dict)

    pca_gdf.to_file(output_vector)
    typer.echo("Progress: 100%")
    typer.echo(f"Results: {json_str}")
    typer.echo(f"PCA computation (vector) completed, output vector saved to {output_vector}.")


# DESCRIPTIVE STATISTICS (RASTER)
Expand Down
4 changes: 4 additions & 0 deletions eis_toolkit/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ class NonMatchingCrsException(Exception):
"""Exception error class for CRS mismatches."""


class NonMatchingRasterMetadataException(Exception):
    """Raised when input rasters have mismatching metadata (grid properties or nodata)."""


class NonMatchingParameterLengthsException(Exception):
    """Exception error class for parameters with different lengths."""

Expand Down
2 changes: 1 addition & 1 deletion eis_toolkit/exploratory_analyses/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def compute_pca(
geometries = data.geometry
crs = data.crs
df = df.drop(columns=["geometry"])
if columns is not None:
if columns is not None and columns != []:
if not check_columns_valid(df, columns):
raise exceptions.InvalidColumnException("All selected columns were not found in the input DataFrame.")
df = df[columns]
Expand Down
12 changes: 8 additions & 4 deletions eis_toolkit/raster_processing/unique_combinations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from beartype import beartype
from beartype.typing import Sequence, Tuple

from eis_toolkit.exceptions import InvalidParameterValueException
from eis_toolkit.exceptions import InvalidParameterValueException, NonMatchingRasterMetadataException
from eis_toolkit.utilities.checks.raster import check_raster_grids


Expand Down Expand Up @@ -37,8 +37,12 @@ def unique_combinations( # type: ignore[no-any-unimported]
raster_list: Rasters to be used for finding combinations.
Returns:
out_image: Combinations of rasters.
out_meta: The metadata of the first raster in raster_list.
Combinations of rasters.
The metadata of the first raster in raster_list.
Raises:
InvalidParameterValueException: Total number of raster bands is 1.
NonMatchingRasterMetadataException: Input raster grids do not have the grid properties.
"""
bands = []
out_meta = raster_list[0].meta
Expand All @@ -54,7 +58,7 @@ def unique_combinations( # type: ignore[no-any-unimported]
raise InvalidParameterValueException("Expected to have more bands than 1")

if check_raster_grids(raster_profiles) is not True:
raise InvalidParameterValueException("Expected raster grids to be of same shape")
raise NonMatchingRasterMetadataException("Expected raster grids to be have the same grid properties.")

out_image = _unique_combinations(bands)
return out_image, out_meta
70 changes: 63 additions & 7 deletions eis_toolkit/utilities/file_io.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from pathlib import Path
from typing import Union

import geopandas as gpd
import numpy as np
import pandas as pd
import rasterio
from beartype import beartype
from beartype.typing import Literal, Sequence, Tuple, Union

from eis_toolkit.exceptions import FileReadError
from eis_toolkit import exceptions
from eis_toolkit.utilities.checks.raster import check_raster_grids


@beartype
Expand All @@ -29,7 +31,7 @@ def read_file(file_path: Path) -> Union[rasterio.io.DatasetReader, gpd.GeoDataFr
# Try to read as a raster first
try:
data = read_raster(file_path)
except FileReadError:
except exceptions.FileReadError:

# Try to read as a GeoDataFrame
try:
Expand All @@ -41,7 +43,7 @@ def read_file(file_path: Path) -> Union[rasterio.io.DatasetReader, gpd.GeoDataFr
data = pd.read_csv(file_path)
except pd.errors.ParserError:
# If none of the readers succeeded, raise an exception
raise FileReadError(f"Failed to file {file_path} as raster, geodataframe or dataframe")
raise exceptions.FileReadError(f"Failed to file {file_path} as raster, geodataframe or dataframe")

return data

Expand All @@ -62,10 +64,64 @@ def read_raster(file_path: Path) -> rasterio.io.DatasetReader:
try:
data = rasterio.open(file_path)
except rasterio.errors.RasterioIOError:
raise FileReadError(f"Failed to read raster data from {file_path}.")
raise exceptions.FileReadError(f"Failed to read raster data from {file_path}.")
return data


def read_and_stack_rasters(
    raster_files: Sequence[Path],
    nodata_handling: Literal["convert_to_nan", "unify", "raise_exception", "none"] = "convert_to_nan",
) -> Tuple[np.ndarray, Sequence[rasterio.profiles.Profile]]:
    """
    Read multiple raster files and stack all their bands into a single 3D array.

    Checks that all rasters have the same grid properties. If there are any differences, exception is raised.

    Args:
        raster_files: List of paths to raster files.
        nodata_handling: How to handle raster nodata. convert_to_nan to changes all nodata to np.nan, unify
            changes all rasters to use -9999 as their nodata value, raise_exception raises an exception if
            all rasters do not have the same nodata value, and none does not do anything for nodata.

    Returns:
        3D array with shape (total bands, height, width) and the profiles of the input rasters.

    Raises:
        NonMatchingRasterMetadataException: If input rasters do not have same grid properties or nodata_handling
            is set to raise exception and mismatching nodata is encountered.
    """
    bands = []
    nodata_values = []
    profiles = []

    for raster_file in raster_files:
        with rasterio.open(raster_file) as raster:
            # Read all bands from each raster
            profile = raster.profile
            # Capture the original nodata before any modification so every band
            # of this raster is compared against the same value. (Previously,
            # "unify" overwrote profile["nodata"] inside the band loop, so
            # bands after the first were compared against -9999 instead of the
            # raster's true nodata value.)
            original_nodata = profile["nodata"]
            for i in range(1, raster.count + 1):
                band_data = raster.read(i)

                if nodata_handling == "convert_to_nan":
                    if original_nodata is not None:
                        # Cast to float so NaN can be assigned also for integer rasters
                        band_data = band_data.astype(np.float64)
                        band_data[band_data == original_nodata] = np.nan
                elif nodata_handling == "unify":
                    if original_nodata is not None:
                        band_data[band_data == original_nodata] = -9999
                    profile["nodata"] = -9999
                elif nodata_handling == "raise_exception":
                    nodata_values.append(original_nodata)
                    if len(set(nodata_values)) > 1:
                        raise exceptions.NonMatchingRasterMetadataException("Input rasters have varying nodata values.")
                elif nodata_handling == "none":
                    pass
                bands.append(band_data)
            profiles.append(profile)

    if not check_raster_grids(profiles, same_extent=True):
        raise exceptions.NonMatchingRasterMetadataException("Input rasters should have the same properties.")

    # Stack all bands into a single 3D array
    return np.stack(bands, axis=0), profiles


@beartype
def read_vector(file_path: Path) -> gpd.GeoDataFrame:
"""Read a vector file to a GeoDataFrame.
Expand All @@ -82,7 +138,7 @@ def read_vector(file_path: Path) -> gpd.GeoDataFrame:
try:
data = gpd.read_file(file_path)
except (ValueError, OSError):
raise FileReadError(f"Failed to read vector data from {file_path}.")
raise exceptions.FileReadError(f"Failed to read vector data from {file_path}.")
return data


Expand All @@ -102,5 +158,5 @@ def read_tabular(file_path: Path) -> pd.DataFrame:
try:
data = pd.read_csv(file_path)
except pd.errors.ParserError:
raise FileReadError(f"Failed to read tabular data from {file_path}.")
raise exceptions.FileReadError(f"Failed to read tabular data from {file_path}.")
return data

0 comments on commit 4fed630

Please sign in to comment.