diff --git a/src/hipscat/catalog/partition_info.py b/src/hipscat/catalog/partition_info.py index f3ea7824..bf00ea40 100644 --- a/src/hipscat/catalog/partition_info.py +++ b/src/hipscat/catalog/partition_info.py @@ -1,4 +1,6 @@ """Container class to hold per-partition metadata""" +from __future__ import annotations + from typing import List import numpy as np @@ -54,8 +56,8 @@ def write_to_metadata_files(self, catalog_path: FilePointer, storage_options: di """Generate parquet metadata, using the known partitions. Args: - catalog_path (str): base path for the catalog - storage_options: dictionary that contains abstract filesystem credentials + catalog_path (FilePointer): base path for the catalog + storage_options (dict): dictionary that contains abstract filesystem credentials """ batches = [ [ @@ -74,12 +76,12 @@ def write_to_metadata_files(self, catalog_path: FilePointer, storage_options: di write_parquet_metadata_for_batches(batches, catalog_path, storage_options) @classmethod - def read_from_file(cls, metadata_file: FilePointer, storage_options: dict = None): + def read_from_file(cls, metadata_file: FilePointer, storage_options: dict = None) -> PartitionInfo: """Read partition info from a `_metadata` file to create an object Args: - metadata_file: FilePointer to the `_metadata` file - storage_options: dictionary that contains abstract filesystem credentials + metadata_file (FilePointer): FilePointer to the `_metadata` file + storage_options (dict): dictionary that contains abstract filesystem credentials Returns: A `PartitionInfo` object with the data from the file @@ -99,12 +101,12 @@ def read_from_file(cls, metadata_file: FilePointer, storage_options: dict = None return cls(pixel_list) @classmethod - def read_from_csv(cls, partition_info_file: FilePointer, storage_options: dict = None): + def read_from_csv(cls, partition_info_file: FilePointer, storage_options: dict = None) -> PartitionInfo: """Read partition info from a `partition_info.csv` file to create an object Args: - partition_info_file: FilePointer to the `partition_info.csv` file - storage_options: dictionary that contains abstract filesystem credentials + partition_info_file (FilePointer): FilePointer to the `partition_info.csv` file + storage_options (dict): dictionary that contains abstract filesystem credentials Returns: A `PartitionInfo` object with the data from the file @@ -144,7 +146,7 @@ def as_dataframe(self): return pd.DataFrame.from_dict(partition_info_dict) @classmethod - def from_healpix(cls, healpix_pixels: List[HealpixPixel]): + def from_healpix(cls, healpix_pixels: List[HealpixPixel]) -> PartitionInfo: """Create a partition info object from a list of constituent healpix pixels. Args: diff --git a/src/hipscat/io/parquet_metadata.py b/src/hipscat/io/parquet_metadata.py index 415571d1..5e6fb4bb 100644 --- a/src/hipscat/io/parquet_metadata.py +++ b/src/hipscat/io/parquet_metadata.py @@ -13,9 +13,16 @@ from hipscat.pixel_math.healpix_pixel_function import get_pixel_argsort -def row_group_stat_single_value(row_group, stat_key): +def row_group_stat_single_value(row_group, stat_key: str): """Convenience method to find the min and max inside a statistics dictionary, - and raise an error if they're unequal.""" + and raise an error if they're unequal. + + Args: + row_group: dataset fragment row group + stat_key (str): column name of interest. + Returns: + The value of the specified row group statistic + """ if stat_key not in row_group.statistics: raise ValueError(f"row group doesn't have expected key {stat_key}") stat_dict = row_group.statistics[stat_key] @@ -47,11 +54,12 @@ def get_healpix_pixel_from_metadata(metadata) -> HealpixPixel: return HealpixPixel(order, pixel) -def write_parquet_metadata( - catalog_path, order_by_healpix=True, storage_options: dict = None, output_path: str = None -): +def write_parquet_metadata(catalog_path: str, order_by_healpix=True, storage_options: dict = None, output_path: str = None): """Generate parquet metadata, using the already-partitioned parquet files - for this catalog + for this catalog. + + For more information on the general parquet metadata files, and why we write them, see + https://arrow.apache.org/docs/python/parquet.html#writing-metadata-and-common-metadata-files Args: catalog_path (str): base path for the catalog @@ -80,6 +88,8 @@ def write_parquet_metadata( for hips_file in dataset.files: hips_file_pointer = file_io.get_file_pointer_from_path(hips_file, include_protocol=catalog_path) single_metadata = file_io.read_parquet_metadata(hips_file_pointer, storage_options=storage_options) + + # Users must set the file path of each chunk before combining the metadata. relative_path = hips_file[len(catalog_path) :] single_metadata.set_file_path(relative_path) @@ -118,6 +128,7 @@ def write_parquet_metadata_for_batches( ): """Write parquet metadata files for some pyarrow table batches. This writes the batches to a temporary parquet dataset using local storage, and + generates the metadata for the partitioned catalog parquet files. Args: batches (List[pa.RecordBatch]): create one batch per group of data (partition or row group) @@ -133,7 +144,7 @@ def write_parquet_metadata_for_batches( write_parquet_metadata(temp_pq_file, storage_options=storage_options, output_path=output_path) -def read_row_group_fragments(metadata_file, storage_options: dict = None): +def read_row_group_fragments(metadata_file: str, storage_options: dict = None): """Generator for metadata fragment row groups in a parquet metadata file. Args: diff --git a/src/hipscat/io/validation.py b/src/hipscat/io/validation.py index 997a4d15..877e7a98 100644 --- a/src/hipscat/io/validation.py +++ b/src/hipscat/io/validation.py @@ -7,7 +7,7 @@ def is_valid_catalog(pointer: FilePointer) -> bool: """Checks if a catalog is valid for a given base catalog pointer Args: - pointer: pointer to base catalog directory + pointer (FilePointer): pointer to base catalog directory Returns: True if both the catalog_info and partition_info files are @@ -16,11 +16,11 @@ def is_valid_catalog(pointer: FilePointer) -> bool: return is_catalog_info_valid(pointer) and (is_partition_info_valid(pointer) or is_metadata_valid(pointer)) -def is_catalog_info_valid(pointer): +def is_catalog_info_valid(pointer: FilePointer) -> bool: """Checks if catalog_info is valid for a given base catalog pointer Args: - pointer: pointer to base catalog directory + pointer (FilePointer): pointer to base catalog directory Returns: True if the catalog_info file exists, and it is correctly formatted, @@ -34,11 +34,11 @@ def is_catalog_info_valid(pointer): return is_valid -def is_partition_info_valid(pointer): +def is_partition_info_valid(pointer: FilePointer) -> bool: """Checks if partition_info is valid for a given base catalog pointer Args: - pointer: pointer to base catalog directory + pointer (FilePointer): pointer to base catalog directory Returns: True if the partition_info file exists, False otherwise @@ -48,11 +48,11 @@ def is_partition_info_valid(pointer): return partition_info_exists -def is_metadata_valid(pointer): +def is_metadata_valid(pointer: FilePointer) -> bool: """Checks if _metadata is valid for a given base catalog pointer Args: - pointer: pointer to base catalog directory + pointer (FilePointer): pointer to base catalog directory Returns: True if the _metadata file exists, False otherwise diff --git a/src/hipscat/pixel_math/healpix_pixel.py b/src/hipscat/pixel_math/healpix_pixel.py index b6a1b70c..046102dd 100644 --- a/src/hipscat/pixel_math/healpix_pixel.py +++ b/src/hipscat/pixel_math/healpix_pixel.py @@ -82,7 +82,7 @@ def convert_to_higher_order(self, delta_order: int) -> List[HealpixPixel]: return pixels @property - def dir(self): + def dir(self) -> int: """Directory number for the pixel. This is necessary for file systems that limit to 10,000 subdirectories.