From 35fa8aabb98c016d53954705987075ca6e298066 Mon Sep 17 00:00:00 2001
From: delucchi-cmu
Date: Mon, 13 Nov 2023 09:26:28 -0500
Subject: [PATCH] Use kwargs for dataset read.

---
 src/hipscat/io/file_io/file_io.py | 14 ++++----------
 src/hipscat/io/write_metadata.py  | 13 ++++++++++++-
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/src/hipscat/io/file_io/file_io.py b/src/hipscat/io/file_io/file_io.py
index 5eb0b0bd..49ca30e5 100644
--- a/src/hipscat/io/file_io/file_io.py
+++ b/src/hipscat/io/file_io/file_io.py
@@ -195,20 +195,15 @@ def read_parquet_metadata(
     return parquet_file
 
 
-def read_parquet_dataset(dir_pointer: FilePointer, storage_options: Union[Dict[Any, Any], None] = None):
+def read_parquet_dataset(
+    dir_pointer: FilePointer, storage_options: Union[Dict[Any, Any], None] = None, **kwargs
+):
     """Read parquet dataset from directory pointer.
 
     Args:
         dir_pointer: location of file to read metadata from
         storage_options: dictionary that contains abstract filesystem credentials
     """
-
-    ignore_prefixes = [
-        "intermediate",
-        "_common_metadata",
-        "_metadata",
-    ]
-
     file_system, dir_pointer = get_fs(file_pointer=dir_pointer, storage_options=storage_options)
 
     # pyarrow.dataset requires the pointer not lead with a slash
@@ -217,9 +212,8 @@ def read_parquet_dataset(dir_pointer: FilePointer, storage_options: Union[Dict[A
     dataset = pds.dataset(
         dir_pointer,
         filesystem=file_system,
-        exclude_invalid_files=True,
         format="parquet",
-        ignore_prefixes=ignore_prefixes,
+        **kwargs,
     )
     return dataset
 
diff --git a/src/hipscat/io/write_metadata.py b/src/hipscat/io/write_metadata.py
index ed4c44cd..d1ed831c 100644
--- a/src/hipscat/io/write_metadata.py
+++ b/src/hipscat/io/write_metadata.py
@@ -117,7 +117,18 @@ def write_parquet_metadata(catalog_path, storage_options: Union[Dict[Any, Any],
         storage_options: dictionary that contains abstract filesystem credentials
     """
 
-    dataset = file_io.read_parquet_dataset(catalog_path, storage_options=storage_options)
+    ignore_prefixes = [
+        "intermediate",
+        "_common_metadata",
+        "_metadata",
+    ]
+
+    dataset = file_io.read_parquet_dataset(
+        catalog_path,
+        storage_options=storage_options,
+        ignore_prefixes=ignore_prefixes,
+        exclude_invalid_files=True,
+    )
     metadata_collector = []
     for hips_file in dataset.files:
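
For illustration, the new call pattern after this patch: the pyarrow dataset
options that read_parquet_dataset previously hard-coded are now forwarded
through **kwargs, so each caller (such as write_parquet_metadata above)
chooses its own exclusions. A minimal sketch, assuming a local catalog path
("/path/to/catalog" is a placeholder); ignore_prefixes and exclude_invalid_files
are standard pyarrow.dataset.dataset() options:

    from hipscat.io.file_io import file_io

    # Keyword arguments here are forwarded to pyarrow.dataset.dataset().
    dataset = file_io.read_parquet_dataset(
        "/path/to/catalog",  # placeholder catalog location
        ignore_prefixes=["intermediate", "_common_metadata", "_metadata"],
        exclude_invalid_files=True,  # skip files pyarrow cannot parse as parquet
    )
    print(dataset.files)  # parquet file paths discovered under the catalog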