From 6be660d57c921aec1d581a3f20c0a2d28b6c10fd Mon Sep 17 00:00:00 2001 From: delucchi-cmu Date: Tue, 14 Nov 2023 10:49:30 -0500 Subject: [PATCH 1/8] Load partition info from metadata file. --- .../healpix_dataset/healpix_dataset.py | 27 +++-- src/hipscat/catalog/partition_info.py | 56 ++++++++- src/hipscat/io/__init__.py | 5 + src/hipscat/io/parquet_metadata.py | 98 +++++++++++++++ src/hipscat/io/validation.py | 18 ++- src/hipscat/io/write_metadata.py | 27 +---- src/hipscat/pixel_math/healpix_pixel.py | 19 ++- tests/conftest.py | 26 ++++ tests/data/pixel_trees/aligned_2_3_inner.csv | 11 -- tests/data/pixel_trees/aligned_2_3_left.csv | 13 -- tests/data/pixel_trees/aligned_2_3_outer.csv | 18 --- tests/data/pixel_trees/aligned_2_3_right.csv | 16 --- tests/data/small_sky/_common_metadata | Bin 0 -> 3997 bytes tests/data/small_sky/_metadata | Bin 0 -> 5130 bytes tests/data/small_sky_order1/_common_metadata | Bin 0 -> 445 bytes tests/data/small_sky_order1/_metadata | Bin 0 -> 2202 bytes tests/data/small_sky_source/_common_metadata | Bin 0 -> 5506 bytes tests/data/small_sky_source/_metadata | Bin 0 -> 29243 bytes tests/data/small_sky_source/point_map.fits | Bin 0 -> 8640 bytes .../_common_metadata | Bin 0 -> 556 bytes .../small_sky_to_small_sky_order1/_metadata | Bin 0 -> 2929 bytes tests/hipscat/catalog/test_catalog.py | 11 +- tests/hipscat/catalog/test_partition_info.py | 25 +++- tests/hipscat/io/file_io/test_file_io.py | 8 -- .../hipscat/io/file_io/test_file_pointers.py | 13 +- tests/hipscat/io/test_parquet_metadata.py | 113 ++++++++++++++++++ tests/hipscat/io/test_validation.py | 6 +- tests/hipscat/pixel_tree/conftest.py | 106 +++++++++++----- 28 files changed, 459 insertions(+), 157 deletions(-) create mode 100644 src/hipscat/io/parquet_metadata.py delete mode 100644 tests/data/pixel_trees/aligned_2_3_inner.csv delete mode 100644 tests/data/pixel_trees/aligned_2_3_left.csv delete mode 100644 tests/data/pixel_trees/aligned_2_3_outer.csv delete mode 100644 tests/data/pixel_trees/aligned_2_3_right.csv create mode 100644 tests/data/small_sky/_common_metadata create mode 100644 tests/data/small_sky/_metadata create mode 100644 tests/data/small_sky_order1/_common_metadata create mode 100644 tests/data/small_sky_order1/_metadata create mode 100644 tests/data/small_sky_source/_common_metadata create mode 100644 tests/data/small_sky_source/_metadata create mode 100644 tests/data/small_sky_source/point_map.fits create mode 100644 tests/data/small_sky_to_small_sky_order1/_common_metadata create mode 100644 tests/data/small_sky_to_small_sky_order1/_metadata create mode 100644 tests/hipscat/io/test_parquet_metadata.py diff --git a/src/hipscat/catalog/healpix_dataset/healpix_dataset.py b/src/hipscat/catalog/healpix_dataset/healpix_dataset.py index d9b98025..18374e37 100644 --- a/src/hipscat/catalog/healpix_dataset/healpix_dataset.py +++ b/src/hipscat/catalog/healpix_dataset/healpix_dataset.py @@ -79,18 +79,29 @@ def _get_pixel_tree_from_pixels(pixels: PixelInputTypes) -> PixelTree: @classmethod def _read_args( - cls, catalog_base_dir: FilePointer, storage_options: Union[Dict[Any, Any], None] = None + cls, + catalog_base_dir: FilePointer, + storage_options: Union[Dict[Any, Any], None] = None, ) -> Tuple[CatalogInfoClass, PartitionInfo]: args = super()._read_args(catalog_base_dir, storage_options=storage_options) - partition_info_file = paths.get_partition_info_pointer(catalog_base_dir) - partition_info = PartitionInfo.read_from_file(partition_info_file, storage_options=storage_options) + metadata_file = 
paths.get_parquet_metadata_pointer(catalog_base_dir) + if file_io.does_file_or_directory_exist(metadata_file, storage_options=storage_options): + partition_info = PartitionInfo.read_from_file(metadata_file, storage_options=storage_options) + else: + partition_info_file = paths.get_partition_info_pointer(catalog_base_dir) + partition_info = PartitionInfo.read_from_csv(partition_info_file, storage_options=storage_options) return args + (partition_info,) @classmethod - def _check_files_exist( - cls, catalog_base_dir: FilePointer, storage_options: Union[Dict[Any, Any], None] = None - ): + def _check_files_exist(cls, catalog_base_dir: FilePointer, storage_options: dict = None): super()._check_files_exist(catalog_base_dir, storage_options=storage_options) + partition_info_file = paths.get_partition_info_pointer(catalog_base_dir) - if not file_io.does_file_or_directory_exist(partition_info_file, storage_options=storage_options): - raise FileNotFoundError(f"No partition info found where expected: {str(partition_info_file)}") + metadata_file = paths.get_parquet_metadata_pointer(catalog_base_dir) + if not ( + file_io.does_file_or_directory_exist(partition_info_file, storage_options=storage_options) + or file_io.does_file_or_directory_exist(metadata_file, storage_options=storage_options) + ): + raise FileNotFoundError( + f"_metadata or partition info file is required in catalog directory {catalog_base_dir}" + ) diff --git a/src/hipscat/catalog/partition_info.py b/src/hipscat/catalog/partition_info.py index b87a3006..3230cdbf 100644 --- a/src/hipscat/catalog/partition_info.py +++ b/src/hipscat/catalog/partition_info.py @@ -3,8 +3,14 @@ import numpy as np import pandas as pd +import pyarrow as pa from hipscat.io import FilePointer, file_io +from hipscat.io.parquet_metadata import ( + read_row_group_fragments, + row_group_stat_single_value, + write_parquet_metadata_for_batches, +) from hipscat.pixel_math import HealpixPixel @@ -44,10 +50,54 @@ def write_to_file(self, partition_info_file: FilePointer): """ file_io.write_dataframe_to_csv(self.as_dataframe(), partition_info_file, index=False) + def write_to_metadata_files(self, catalog_path: FilePointer, storage_options: dict = None): + """Generate parquet metadata, using the known partitions. + + Args: + catalog_path (str): base path for the catalog + storage_options: dictionary that contains abstract filesystem credentials + """ + batches = [ + pa.RecordBatch.from_arrays( + [[pixel.order], [pixel.dir], [pixel.pixel]], + names=[ + self.METADATA_ORDER_COLUMN_NAME, + self.METADATA_DIR_COLUMN_NAME, + self.METADATA_PIXEL_COLUMN_NAME, + ], + ) + for pixel in self.get_healpix_pixels() + ] + + write_parquet_metadata_for_batches(batches, catalog_path, storage_options) + + @classmethod + def read_from_file(cls, metadata_file: FilePointer, storage_options: dict = None): + """Read partition info from a `_metadata` file to create an object + + Args: + metadata_file: FilePointer to the `_metadata` file + storage_options: dictionary that contains abstract filesystem credentials + + Returns: + A `PartitionInfo` object with the data from the file + """ + pixel_list = [ + HealpixPixel( + row_group_stat_single_value(row_group,cls.METADATA_ORDER_COLUMN_NAME), + row_group_stat_single_value(row_group,cls.METADATA_PIXEL_COLUMN_NAME), + ) + for row_group in read_row_group_fragments(metadata_file, storage_options) + ] + ## Remove duplicates, preserving order. + ## In the case of association partition join info, we may have multiple entries + ## for the primary order/pixels. 
+ pixel_list = list(dict.fromkeys(pixel_list)) + + return cls(pixel_list) + @classmethod - def read_from_file( - cls, partition_info_file: FilePointer, storage_options: Union[Dict[Any, Any], None] = None - ): + def read_from_csv(cls, partition_info_file: FilePointer, storage_options: dict = None): """Read partition info from a `partition_info.csv` file to create an object Args: diff --git a/src/hipscat/io/__init__.py b/src/hipscat/io/__init__.py index 3f046d75..78098bf5 100644 --- a/src/hipscat/io/__init__.py +++ b/src/hipscat/io/__init__.py @@ -1,6 +1,11 @@ """Utilities for reading and writing catalog files""" from .file_io import FilePointer, get_file_pointer_from_path +from .parquet_metadata import ( + read_row_group_fragments, + row_group_stat_single_value, + write_parquet_metadata_for_batches, +) from .paths import ( create_hive_directory_name, create_hive_parquet_file_name, diff --git a/src/hipscat/io/parquet_metadata.py b/src/hipscat/io/parquet_metadata.py new file mode 100644 index 00000000..97037665 --- /dev/null +++ b/src/hipscat/io/parquet_metadata.py @@ -0,0 +1,98 @@ +"""Utility functions for handling parquet metadata files""" +import tempfile +from typing import List + +import pyarrow as pa +import pyarrow.dataset as pds +import pyarrow.parquet as pq + +from hipscat.io import file_io, paths +from hipscat.io.file_io.file_pointer import get_fs, strip_leading_slash_for_pyarrow + + +def row_group_stat_single_value(row_group, stat_key): + """Convenience method to find the min and max inside a statistics dictionary, + and raise an error if they're unequal.""" + if stat_key not in row_group.statistics: + raise ValueError(f"row group doesn't have expected key {stat_key}") + stat_dict = row_group.statistics[stat_key] + min_val = stat_dict["min"] + max_val = stat_dict["max"] + if min_val != max_val: + raise ValueError(f"stat min != max ({min_val} != {max_val})") + return min_val + + +def write_parquet_metadata(catalog_path, storage_options: dict = None, output_path: str = None): + """Generate parquet metadata, using the already-partitioned parquet files + for this catalog + + Args: + catalog_path (str): base path for the catalog + storage_options: dictionary that contains abstract filesystem credentials + output_path (str): base path for writing out metadata files + defaults to `catalog_path` if unspecified + """ + dataset = file_io.read_parquet_dataset(catalog_path, storage_options=storage_options) + metadata_collector = [] + + for hips_file in dataset.files: + hips_file_pointer = file_io.get_file_pointer_from_path(hips_file, include_protocol=catalog_path) + single_metadata = file_io.read_parquet_metadata(hips_file_pointer, storage_options=storage_options) + relative_path = hips_file[len(catalog_path) :] + single_metadata.set_file_path(relative_path) + metadata_collector.append(single_metadata) + + ## Write out the two metadata files + if output_path is None: + output_path = catalog_path + catalog_base_dir = file_io.get_file_pointer_from_path(output_path) + metadata_file_pointer = paths.get_parquet_metadata_pointer(catalog_base_dir) + common_metadata_file_pointer = paths.get_common_metadata_pointer(catalog_base_dir) + + file_io.write_parquet_metadata( + dataset.schema, + metadata_file_pointer, + metadata_collector=metadata_collector, + write_statistics=True, + storage_options=storage_options, + ) + file_io.write_parquet_metadata( + dataset.schema, common_metadata_file_pointer, storage_options=storage_options + ) + + +def write_parquet_metadata_for_batches( + batches: 
List[pa.RecordBatch], output_path: str = None, storage_options: dict = None +): + """Write parquet metadata files for some pyarrow table batches. + This writes the batches to a temporary parquet dataset using local storage, and + + Args: + batches (List[pa.RecordBatch]): create one batch per group of data (partition or row group) + output_path (str): base path for writing out metadata files + defaults to `catalog_path` if unspecified + storage_options: dictionary that contains abstract filesystem credentials + """ + + temp_info_table = pa.Table.from_batches(batches) + + with tempfile.TemporaryDirectory() as temp_pq_file: + pq.write_to_dataset(temp_info_table, temp_pq_file) + write_parquet_metadata(temp_pq_file, storage_options=storage_options, output_path=output_path) + + +def read_row_group_fragments(metadata_file, storage_options: dict = None): + """Generator for metadata fragment row groups in a parquet metadata file. + + Args: + metadata_file (str): path to `_metadata` file. + storage_options: dictionary that contains abstract filesystem credentials + """ + file_system, dir_pointer = get_fs(file_pointer=metadata_file, storage_options=storage_options) + dir_pointer = strip_leading_slash_for_pyarrow(dir_pointer, file_system.protocol) + dataset = pds.parquet_dataset(dir_pointer, filesystem=file_system) + + for frag in dataset.get_fragments(): + for row_group in frag.row_groups: + yield row_group diff --git a/src/hipscat/io/validation.py b/src/hipscat/io/validation.py index 51339bdb..b62c8f37 100644 --- a/src/hipscat/io/validation.py +++ b/src/hipscat/io/validation.py @@ -1,5 +1,5 @@ from hipscat.catalog.dataset.catalog_info_factory import from_catalog_dir -from hipscat.io import get_partition_info_pointer +from hipscat.io import get_partition_info_pointer, get_parquet_metadata_pointer from hipscat.io.file_io.file_pointer import FilePointer, is_regular_file @@ -13,7 +13,7 @@ def is_valid_catalog(pointer: FilePointer) -> bool: True if both the catalog_info and partition_info files are valid, False otherwise """ - return is_catalog_info_valid(pointer) and is_partition_info_valid(pointer) + return is_catalog_info_valid(pointer) and (is_partition_info_valid(pointer) or is_metadata_valid(pointer)) def is_catalog_info_valid(pointer): @@ -46,3 +46,17 @@ def is_partition_info_valid(pointer): partition_info_pointer = get_partition_info_pointer(pointer) partition_info_exists = is_regular_file(partition_info_pointer) return partition_info_exists + + +def is_metadata_valid(pointer): + """Checks if _metadata is valid for a given base catalog pointer + + Args: + pointer: pointer to base catalog directory + + Returns: + True if the _metadata file exists, False otherwise + """ + metadata_file = get_parquet_metadata_pointer(pointer) + metadata_file_exists = is_regular_file(metadata_file) + return metadata_file_exists diff --git a/src/hipscat/io/write_metadata.py b/src/hipscat/io/write_metadata.py index ed4c44cd..37fc61ce 100644 --- a/src/hipscat/io/write_metadata.py +++ b/src/hipscat/io/write_metadata.py @@ -10,6 +10,7 @@ import pandas as pd from hipscat.io import file_io, paths +from hipscat.io.parquet_metadata import write_parquet_metadata as wpm def write_json_file( @@ -116,30 +117,10 @@ def write_parquet_metadata(catalog_path, storage_options: Union[Dict[Any, Any], catalog_path (str): base path for the catalog storage_options: dictionary that contains abstract filesystem credentials """ - - dataset = file_io.read_parquet_dataset(catalog_path, storage_options=storage_options) - metadata_collector = [] - 
- for hips_file in dataset.files: - hips_file_pointer = file_io.get_file_pointer_from_path(hips_file, include_protocol=catalog_path) - single_metadata = file_io.read_parquet_metadata(hips_file_pointer, storage_options=storage_options) - relative_path = hips_file[len(catalog_path) :] - single_metadata.set_file_path(relative_path) - metadata_collector.append(single_metadata) - - ## Write out the two metadata files - catalog_base_dir = file_io.get_file_pointer_from_path(catalog_path) - metadata_file_pointer = paths.get_parquet_metadata_pointer(catalog_base_dir) - common_metadata_file_pointer = paths.get_common_metadata_pointer(catalog_base_dir) - - file_io.write_parquet_metadata( - dataset.schema, - metadata_file_pointer, - metadata_collector=metadata_collector, + wpm( + catalog_path=catalog_path, storage_options=storage_options, - ) - file_io.write_parquet_metadata( - dataset.schema, common_metadata_file_pointer, storage_options=storage_options + output_path=catalog_path, ) diff --git a/src/hipscat/pixel_math/healpix_pixel.py b/src/hipscat/pixel_math/healpix_pixel.py index 1b91e84e..fe317f82 100644 --- a/src/hipscat/pixel_math/healpix_pixel.py +++ b/src/hipscat/pixel_math/healpix_pixel.py @@ -53,7 +53,7 @@ def convert_to_lower_order(self, delta_order: int) -> HealpixPixel: if delta_order < 0: raise ValueError("delta order cannot be below zero") new_order = self.order - delta_order - new_pixel = math.floor(self.pixel / 4**delta_order) + new_pixel = math.floor(self.pixel / 4 ** delta_order) return HealpixPixel(new_order, new_pixel) def convert_to_higher_order(self, delta_order: int) -> List[HealpixPixel]: @@ -77,6 +77,21 @@ def convert_to_higher_order(self, delta_order: int) -> List[HealpixPixel]: raise ValueError("delta order cannot be below zero") pixels = [] new_order = self.order + delta_order - for new_pixel in range(self.pixel * 4**delta_order, (self.pixel + 1) * 4**delta_order): + for new_pixel in range(self.pixel * 4 ** delta_order, (self.pixel + 1) * 4 ** delta_order): pixels.append(HealpixPixel(new_order, new_pixel)) return pixels + + @property + def dir(self): + """Directory number for the pixel. + + This is necessary for file systems that limit to 10,000 subdirectories. 
+ The directory name will take the HiPS standard form of:: + + /Norder=/Dir= + + Where the directory number is calculated using integer division as:: + + (pixel_number/10000)*10000 + """ + return int(self.pixel / 10_000) * 10_000 diff --git a/tests/conftest.py b/tests/conftest.py index e07e6c76..58595a72 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -216,6 +216,32 @@ def source_catalog_info_file(test_data_dir) -> str: return os.path.join(test_data_dir, "small_sky_source", "catalog_info.json") +@pytest.fixture +def small_sky_source_dir(test_data_dir) -> str: + return os.path.join(test_data_dir, "small_sky_source") + + +@pytest.fixture +def small_sky_source_pixels(): + """Source catalog pixels""" + return [ + HealpixPixel(0, 4), + HealpixPixel(1, 47), + HealpixPixel(2, 176), + HealpixPixel(2, 177), + HealpixPixel(2, 178), + HealpixPixel(2, 179), + HealpixPixel(2, 180), + HealpixPixel(2, 181), + HealpixPixel(2, 182), + HealpixPixel(2, 183), + HealpixPixel(2, 184), + HealpixPixel(2, 185), + HealpixPixel(2, 186), + HealpixPixel(2, 187), + ] + + @pytest.fixture def association_catalog_info(association_catalog_info_data) -> AssociationCatalogInfo: return AssociationCatalogInfo(**association_catalog_info_data) diff --git a/tests/data/pixel_trees/aligned_2_3_inner.csv b/tests/data/pixel_trees/aligned_2_3_inner.csv deleted file mode 100644 index 696dc40e..00000000 --- a/tests/data/pixel_trees/aligned_2_3_inner.csv +++ /dev/null @@ -1,11 +0,0 @@ -Norder,Dir,Npix -1,0,33 -1,0,35 -1,0,40 -1,0,42 -1,0,43 -1,0,44 -1,0,46 -2,0,128 -2,0,130 -2,0,131 \ No newline at end of file diff --git a/tests/data/pixel_trees/aligned_2_3_left.csv b/tests/data/pixel_trees/aligned_2_3_left.csv deleted file mode 100644 index a7347b36..00000000 --- a/tests/data/pixel_trees/aligned_2_3_left.csv +++ /dev/null @@ -1,13 +0,0 @@ -Norder,Dir,Npix -1,0,33 -1,0,35 -1,0,40 -1,0,41 -1,0,42 -1,0,43 -1,0,44 -1,0,45 -1,0,46 -2,0,128 -2,0,130 -2,0,131 \ No newline at end of file diff --git a/tests/data/pixel_trees/aligned_2_3_outer.csv b/tests/data/pixel_trees/aligned_2_3_outer.csv deleted file mode 100644 index 6902c1b5..00000000 --- a/tests/data/pixel_trees/aligned_2_3_outer.csv +++ /dev/null @@ -1,18 +0,0 @@ -Norder,Dir,Npix -1,0,33 -1,0,34 -1,0,35 -1,0,36 -1,0,37 -1,0,40 -1,0,41 -1,0,42 -1,0,43 -1,0,44 -1,0,45 -1,0,46 -1,0,47 -2,0,128 -2,0,129 -2,0,130 -2,0,131 \ No newline at end of file diff --git a/tests/data/pixel_trees/aligned_2_3_right.csv b/tests/data/pixel_trees/aligned_2_3_right.csv deleted file mode 100644 index df302ddc..00000000 --- a/tests/data/pixel_trees/aligned_2_3_right.csv +++ /dev/null @@ -1,16 +0,0 @@ -Norder,Dir,Npix -1,0,33 -1,0,34 -1,0,35 -1,0,36 -1,0,37 -1,0,40 -1,0,42 -1,0,43 -1,0,44 -1,0,46 -1,0,47 -2,0,128 -2,0,129 -2,0,130 -2,0,131 \ No newline at end of file diff --git a/tests/data/small_sky/_common_metadata b/tests/data/small_sky/_common_metadata new file mode 100644 index 0000000000000000000000000000000000000000..a1505a289449f1419095c4f6c9dbc9a7f9262a58 GIT binary patch literal 3997 zcmcgv>u%ae6t>!`+ODcysZ~})swWBXo-RJ`xe?BcV@eJiozT z5MpD+7lNqy0zLRkH-W#YkPct`LRYIJU27q9J_cR>mF(l0A$3o+N_%cK zyO9;NymODYWLz7Y8h@=<#FAEFGBPerWkDHQm1Wsdt34C;GPD7)LA(=!2ihL5o&b$XUBM#ejkld5G)T9ADbEbX4t#0$z3l(OYx*uR5iZ_$VS%9h4g zwUbWOmdL)yvnt9bLv5EBzlz1E$M1!;7E;CjC$`D1ozrjpbVIAMrf$Zkyc4#iagtI(0y!4gq(@j*|-?~ za;|Hal{WJM;$L34-7HRHYC$gl6BfUN^>|-QR=iz#^ew42xG64Mcq0RH2tJkcRgElb zX9B6utp8%q(3IFZk!c)qwIwLA-mPIfQ0BHraqq$wTER_m!kK(-)|VA@r&bf{sJT%$ zBdEu3;mXPE%Hc+>1l`DP;_TVT;{L99z4vP|ob7mDJ#Fy!)^;n=-(B0SYnF@es_juH 
zO!AUHV{6{NoL#qAPIuHc$yti?yK;7JOZaqeTmn9H**W?@u=Gp`FYHQt;E1&l`U_5M z=VP?1CzC$8D|bORTQB*TZSr^4lLoy>_S>;KhKzejs!X)>3pQ5 zXlp--QrG(dFwm$jOpOidPpLd=P(n3bCwR!9&f_85D$H^g~|{EYQJA!`@f_@ z+xV$zn1hrF_a(UB=;YqSo<1ZU?0^u&$wqZT1?N{r_xMAvN)jG{G`&B8KDeB2n)rj> szU2zPhUVnc%#EJgVfdWh2al2q-x0kD9{%-&@~ literal 0 HcmV?d00001 diff --git a/tests/data/small_sky/_metadata b/tests/data/small_sky/_metadata new file mode 100644 index 0000000000000000000000000000000000000000..63b03188f42a9fb1b4446d058e8ea678f6837297 GIT binary patch literal 5130 zcmcgwOKclO7~Z5IN=redsVzsi5V>?|MO!}-w{=mfjGfrANu0!O{X!`0%l6v4YsYV= ztyFP9RZpA{LKO$j72?1lLa3?`>WL#l964}6y&$2ghaMvSnc211UMDS4VAXDR{>S{^ z|Cs+Dc?~7Jwy~)Pe#S9g6AF^b*^V=|vD2fDQCVbcNAO3LqaTMvNua!mDz_n3tK}+P z9tMW79tDl@c)2P{z{HC~3$p$-7O%){s`2_pL9WyUuD&5_qSQX^Tz)V)f1F_$`=_H* z&Z+y3x7~D)IgcCk;f;0Yyxt2HuDaQf>h^m_oq*eaj2WN8P;H~2X@5Ft|Kuq9`vg1T zbWB`51pC3`nD}9oe%YBbL#WKuIop^Gz%uYU27f^1h326u`p?`_3!uXH)Dmn*rcX;E@jC|c$rJEz!RPdOc9IQR4oAuz4><@)x!J)8K9nGOEPvR|?X zZSdLIoR- zTvw()boF;C*GK23U6f5*mqes)x-?FeKwfJo3P|K+Nf9^tY1GO_y;H%0u7)NHWl?g0 zwAN57oi3xy7bT&N>8e!cM6S+(_)V9jfw-*YvF?rQ;q#YWw{Q2r^rLXeBG&Iiq0UDdM0Uw9@Qsd6tHthA#jFD~Nhhn-UXN}yHsizc7XK)DFaMx3SuN?r12luL2Ua@< z^#BbGv;)g2v+@UwH@aofAI%2e^a?_&WzY>?imFN@ z8%fTleeov0l&tgGio7ZZ`KS^obLrq##uxUaeN-l@cxus5tjPP?UOwf|aj;i5T~LHh z^kOtzhrRp-K6L}@H^fxCwj4roR#g-}lF0FDxSqWRauL6mkF@YUI+l1&TvA$u7m9g# zRe}8zN>py;VLvaQ4*JtG3BM4*`y!e5#Wu64xQFt+iWmulKZ!!LSk6nGn8K^^W;OzN zF{V%~nOw}&S3|KtF4>2D3(KcwYH}A>C|1daq8OLTrINL5IMB$Xyb6|s7?p@UEm+{y zL_zQdJW@Jn@EYPmM#p2t3o-J%KVk0pjm%Nl7!#Y>kIp{wpkMLdkS?i%FL^ zERQN52^98<%K+Yb7IFwac{1G^X$O`=xIWGPhjZCKu;ENZ;aG(PHMbT|rV>GNM<32eocUsb1FV>j#^M}?`8hTSy zjd!xCum_(x=<9GE+m#ITC_yWcehf7V-?64h`-Bh9=N6X=G)UhpmKrOe*;Z5|y|))j zHQ5n-N|TpykB)YhiwNXm0T@s*Jg=H~tS#kXj0IQlYzyOc9M88<8S^FlS|mdc7_eTu z=z-_-hKcKH2f_tO$YU+A&(Xb z!EdZ}Ja|x>@xX`%eK-sGg&Xds+fCTl$a@`M4d5$?3S^Bz;dyR61d4HPfw-v4BSiFr zXh2`UiY9#$U+4+Nv=4Fs{Q>h=*FcM0A1G4MSnK%lMa7IC;syDM70i2}$QEpJ8FPX( z2}OOqxImIbspJz2Aq0026okIGg1is|Vn4so^S@1m2JsWqFb4?}?n`jL$>4hvee#fW zumeI6Cjl*BDmdRYx{E*LsuWR5AWiO1pbst^o9h0cxjFc;O8-slyilo}C-=c={lYg* WZh}+yoeZmqjXk^tMWg+Yw3>P&l~AHBNLA!#dGqHvv#?1g zWOm+T-eZQHA{&P;NWgNC_bDR~gaZ!VPKW!>woToqHKsaruW4958 zljqA?X#cg2IgY^bs4{R5l#mH-fCOHmSSf)KR~~T3Ct-aO;#cyN`3Waw<-M^r0g06& z7o16_ArrE|F Vob>v=zH`+PqJG5ho)A;IxPNU`aUcKy literal 0 HcmV?d00001 diff --git a/tests/data/small_sky_order1/_metadata b/tests/data/small_sky_order1/_metadata new file mode 100644 index 0000000000000000000000000000000000000000..dddd67547d7253086e8c653b0b37c81cbc0c4314 GIT binary patch literal 2202 zcmchZZ)@5>7{JdMG98Xew}b?xvx6Oj)<3DW)zQAV=;p@yZ_;jb?=%ZmwpN#JbAAC4 zDg7M#1Vf~ZzA7SRpK15RoF`geNWnwyo_j9$; zS}R2pv0Bmo_QU$F^gLmDrxcQc8Y?afilB5n%C!flu`jIpJZR8(q@WjfWRbQii!zc0 zM7tmz5SkH2Jw*&Lk)(di$ac!Li+*+V8 z(V;dLfi@QN^{LvLe()yD?5+2h!%ceYEaKMUJiXm29B0him5eXsd3x(>oMqfwSH*3W z!o-H&SOnf!%-5%SYb}B|ab|D*Fo&D;)(zoyXrA8wJ>@uK-u^D(%cXgGyM4x4#=TuF z(!y$h6bkzvUk=`iYDb7eTOrWiJ3S%q2&ogJOkNXG zCfTx+H4H*_A<7c+7DhT}Bb~vLu@3ot=$um7FHo9J407TmkD;ggSbGJ*;h9mhoJ{t} zy`GRGNaWa@?RT;x`*UzrA^){Oc5`%V8m5MG2zn8Pr+ml6s1La5&Di-^J?CU2_3}{Q z@HEr-l>gA6A>Chx^S*)S;I=`1vb~Mei2Df;HEXqK(YCAg$WO~YuT;+>dNP)XCA6(D PLW3W=q_5Zm`c?8Dg-w1p literal 0 HcmV?d00001 diff --git a/tests/data/small_sky_source/_common_metadata b/tests/data/small_sky_source/_common_metadata new file mode 100644 index 0000000000000000000000000000000000000000..aa1aa280c7a8c36565679e0d272c45d586ad831e GIT binary patch literal 5506 zcmcf_Yj4_C6tz*MRoy;pVoIuhh>-eeZ38B>rBbC`8!)E9Noazxrz!FaFtLpb=2_(* 
z?e|Rku%ECWv2(7C!Pum!l-2^{d)}{m?lEViRmkog{8!5CKN{J_c0PxTmgrpB0wU^000j51ob&cnllV_PsD0Qd}Ox<5g z4cC#>a80E?011!8;m9yYnJg|IR;=)8PxP#LAo8cHzU7T|6@L-Q28=iCdA2QiEb*#>7!uHd(I!`Z#G(1l}+R9ld!0c7;x^ZHf^I4Mx)M%QOtjOJ3S+1( ztx+|`w@n1Tty)64X+;`CZK>R-#kkT9#2wV87HO#W+Z1Wg89x{kNX*c zZ}Sp*Z&FhTyhkhG>P9*K8b2F9?dm;JDjwkk4)7cj%d@jEp4QZ z7TEK>g*qAyXXyFV9F47^i_pUHtNg1%4mxozRK9^^09VZ2WcHtsR_j!H7o6Yw|2#dS zSFb?`IfpM^9|$=nY>(6k=@G6+VJsWN&k$H$<7f3?4R`azQ*K(&xh$!>0ej_;cMB&@V9Y&mL^|r6MP3zQRHNh?q)egIqnOeTX1T=!39}8S# zpfU4;CY8D>$Wl6eTVDvTg_;ktN_|bbME;2(&BrGku@uL)HNMr=oSLtE1UkM{(D)h3 z32hO(M%|uK9M>pljt%l#wqVVAAg`cx*ixt1D(O7RC6>lCJ}Huz5AX%s;A`Mdt1k?O zJ#*2pHAkE(JlKmiagBDnD*LBgqug!BY+oY1R2*B8U0h?S726A$X@g(qR;ukOG&)~{ z_|=x1HbPwT;vkTrxouys+nYXmntRTOoPaU#6|f!n^KulMdHICHysTnLES)JAbVsy1 z#YR8yTk#ZWe%)bip>f&1=Ct~Hp`156td0H(jxyFDpSRfNXneVEROHn7f-hsO4soof z#|`Re@J0I|xAKDeo|gYD{mJ|*&U`9&zAk>6zQZ0X z%*^&J`IXvvS!(qSz8dYx?#CJKlZh-9!kTD6o$Rc43;P9)U0QtH_C%F%zg<#)QfhrF zJDx2U!@Y1Pd&-vTv?y$>aa5^2R%+!9n-er?wY0gF^b8xA~nBgU$j@#^7?K1#Hm(nzur&zug^_w>3r zfb%z;1p*(^a%V`J`z;x>I6i)^2j@<~z%yAcHf+#0)h#&R@f_l9OLuAu6VFeWDNL!h z(4BHIh?ipPuIOpZF`XTn7w3~EcRUwdIt#3OhSOf?j6Ky*CMPUT24vD)G|DhAiSUI< z3dX2PkAuf5j#UhfVIO8NX0dN^_G;mp#j&fRBmF15)p7n}0f8vEniu7xqqp!-L)B63M+}G(pjy~1m-tcYntvhV`fSbrCgKn5Pw0>HdkPvQ zE<^oJRLjy_FH|E2DEFX;>0VfaFJ7`&<-{0o0|-sO{-W;2;H`1kdH Ddy&Qe literal 0 HcmV?d00001 diff --git a/tests/data/small_sky_source/_metadata b/tests/data/small_sky_source/_metadata new file mode 100644 index 0000000000000000000000000000000000000000..f5083342cf4bd6d3cf168ab55542ef87a282a33f GIT binary patch literal 29243 zcmchg3tUxIzQ@l?IH2aE*kR*s9OkxqSqi90W^q74Oz@HMn))$9ovTwyVp_5}`l+}mjf*q0oE*U~ zI}7WhqRo6jAHP=ER39G^qj8x4I~gJ)5~A+VESs*;2uDqfTpMjpm5(YcOGg!!=}~oX z1c#u+tZ~*wkofpozKPe)4eOh!XH#BZK)WKYh!{lj9v0Zdr6jx&8`p1!C z12Il_HHNR&!eKQUS7YnV{5MW>mz}6)+DsRN3s}~`$7J{icDa^^zDd(voJV*_ju1T@ zFp~V6NSz~?hz?tIBIScLZ&zc}Uv_?YzB&kOmt0?O)%$|agpUps;7X9*xlO7 z3*SFvbv5Q4dHJn|;DuInp^U0=(eYc^1vV;1e&{!(;#VN=`RF&d-0E@>Dk12|%GEz- z_0URCM%BpI-^a>F#4z$*_mW+Afc$O8Z(DnF=NypFeC?sQvfJ;c^2(?h`R3WvT_y+( zfu;}Duh()7TGBC#WY3yDPji=cldCpH&C&BcnWiq~B9*Apa#k%Fb|;PQP{i?i+SE1j zH$1`0s2USR^W++Kk~#p!M9y5&ItNTp3suFrn(m?#&bH9iQ3pG;T#=R(E+D-RPIon6iTO`ZW|q__);=sN zVt>lWH7_Ll7v5mGE8emk?X(%a9B*>^5zxaD@t;^cc}Y$22A0STd;}S&@_-}P;Xsc1 zlOi8XBy3Ex&t>X-@Z@?oxb4^Y*iq#UJjg{6BiHi4EgB5~<<}xm+--pJfJBt;&$O}) znWW&L@(ymtIURTUGcDN*XBe?!k+8CHii{Oy)Q+)hJ2_KjklVTFgn~tIvhSRZlw8%5 z>d&>{#d*)Y(_WpD3K_TO^+Ce~*A&{bDWj^qbiCmxd$HwFGOEf&%P_bY2_3Le zIf=7QBE3IMAca3n7?V5K%!o@{Co&nqk_UA1#&JakQoea2X}dZB3cFB(PYn*9;0JqL zWmHvRhjYM*G}L1815PgKxoSN*l|7NvZJP*AsJIdHfqoP6K^ax$WZ(tHi7kV`O0%9T z)06h+Cz7UI$x5adWCNA==Cgr*w~4B<(!B$$C_w}^3bkB`mR#CCk(}5LHVoK9@t@dP z%S)bK@G_$*B1sa>e0d^iEkMy(R1`n4pz@Nc zf?jza#`oeJL&+tO@9xxbZDl&ro;8V-yfRUQ(}Y>xTn(ObYfJB8m!o#Xxy<3ofEB<~ z*ISc#KFtvMGR4qWAYuJ<>jdm4pPngpjl22RST9b>N7 zwvqb;lZwKGlER6%abi9hdW=|q3yFBkJsd1v1 zIwu%wKtD{lEKOr48X#AN8za+kCuw@^oFUPdd}BWh$WE#cEa8C7ScqnNQ` z3nr-lQ(CTFOKM+*R`9CiWyVSoOuy&zfqJots`JwHI(Si%34HKmYTKnF6}!NPfnH~! 
zpV(o`OX_rscQd*Ink3P^Z|KNS35sGde<_---5fhg8BviIzH}iw$$dqM}mBr&)Hhd*eN*Pt>rt^;ua@7cwVr*o?Ngt@gIh@ApnbhT!Q}?xh zQ=cx(o|I8_Hg;thSsRp7jE$Nzj^yB*4qzkiVoa!>lIjyfMZmi{o>a=HIvY7Vz{W^Y zfsZ0*uGE>-WE)A>Hlr$1$&InJ05rrT74bkx1wtKqu24_PcN$6Fi+D-lL5Wmt45^e+ zb;a!}VrpHqBIdyIYISVQrSe+dCA@lUX-K)A3UjcykR8WCE$Z$lX_CgEu*QF(G^G~iMH)A zl8QYjibcL&QputPZ!)4HsU+2sy+$(dM^t^As^TXWRbEn8#YJB^$w{Cj73fz$x7=Q# zC1;K}k)vfa>EL8%Ho4|pej87__xh6$KlrvCb*|x4XRhH^U{%gXPGhrbzBG**${USU zbZw zkN>uQWeTt=dhqxy`l|JmRm!M3GqsmSWfkLQOQV5QY}SLDA&&!dg8%1H$jkjdd^Pj3 zz$#@_otypVl&oTGT(CNj;(lb6muGpTD`nO8r%v4+C$LHxRc9lsmFcR4ITd50?j7hc zyO34KpFB|aH+rd0=XcNEe&JG|GOEr-b0^ps$tv(M=*(q}CuMC$()g7+R>_UA=salH zWfce&8Ms>;u07{e-MR9$g*_A<3@!z!@SrRVnPNmh@MT)2Q%u#42d zTuS&RWQ8)S&Pw$q#)>VgupQ{Q79HvCGm_39BrjIX394_yDmJRl%YlD@7bUB}2hXa? zez@b@j~%!?k5bPnxn&GoW^@HsNursTjihw|MX|`&%PLv4dXNznStY5~{bVG&enQm> zs*0ajRC!5V6`wu|HyFeuP_hd2PeHdl4EJ2?i=4@spCwjp{^yfdD=4eHt_FHl*iq;D zYsPbRaDk3%+V4CztGpyu)fJ7#s>CF*!jpZSd?~9|Ei>M45m=>+sF*{snK2738iT?@C#9M|$nX zB!N}Rs5%=(wP0f;tH8(U@!W;+BpZf!2P>Ua&8y_bSo{$)q!a-`=T#uoX5fbPq^-`0 z?E26NiuVX-Qu{gh zxFM{PTgLt`8C`)@lIWopCz98KqFBu9WtA-2i!&Utup+A@)hn$|q^T8E-=V7biL8>B z)KzicpU02ID$pOyg4lMjh}+ z>IOw0G$DB(rN1XNFu^N@XLO8{ix;eU_D8(V@nq8*t-8DT=kk%rlB|^X zYDvK^B<4+PBDTMk;s^P-ukcIHM;-~Hj8aC`c{$L*Of>~YNhZpgbR>5eO+>Gl92-Ch z^}F^D((?pDDWmF46j+%tgg_|CM1F;yT>2KTg6=){*s?GGiFc?57Cim)w*sG(QFSH; zj)RGjd;%8*$;Sy9w)O1VN{e*l_M%7uV{fx0<%O|LR@B()S z3!3z#@>9u+74w1Wd-M4~zuQFBc{y|nyog=4{16}dpwCX!e6A;jpM#GZ!6&9=47V`4 z0-q$&oEAMf{RN6*F|U(PjA-pwjHt*bNwuL>PxiK=>LIF%pU5Y9NnI5;?t=?TViG9l zQlQ`U6xSI@`rt|J3vD#z@Kpbr(+wTMlb(6&U$&D&cC@*=PYqm&GtjA}&@eWg+$A~{ z1j=SaD$tC~zlj^%s^#(m0SOdODI&JPAhfv*7;JF)cYA z1PS@bqg!u(={rowW7{@wpE#2TlrpN$OH(TY1zSKdW;*_=BekcH4j#32>+bCLhLpVP z_|MP2cg&9>N*Pt>rm})_#pYIonfxe2#``n)!E4J zWUhk>kYa4?`piHwPh$sl=5>eH&(hmdALrd2G)d3jo>E5D*(mP;8zV6VJ{ldkRtHkv zWgxBR45~0Cx5b_xm<$!!ASOktj;qpHSIQCN!;fF=Z1~XXWT+#)>Vbz{_sxC2PPy%KjmF@ivK=0yN>J z1vIF!iK_E*a1gvGF$F$&OjZ44AUl2nA2$S3a?7|f#OMl`l0>%*8A#jDD2k=LUQEfN zN3SxXBBmtO=a5d^ zqu*UTjGXdz4t{#pVLRen*FJa(<0jx#YumW7Ipra7s%PCOoH}|lw*`LPqjMeUTuwQ) zLE_Z245yS)JI+SslIs4FbkAXbzY8s%aFmAQ;eBIAHq21{}2#!gBHHNu-_MQ(s^Ra>TM4%r8%jLs&jMT zRp#QOz$wPY-hUcM!FLD>69|6&c>l{z{W^UfsbY**KQ=`UpSD~Qx2*)CAY<$-+_iW zk`&=UIR!$wdaexyDBr>_AavlJxVaLitPH1=QFX;FVDC$cR>ZWZ*Ky@KQh(Ngbe=&g z*gb0YNtgTB$0%%~>a3KVXBvPlr=VKk$z*uQsr`Eg(nW7jQZL?%L{540IiZZI^HS6U zUX+{yA3Uc@|K>mjFF>8#5KhT0<4iB3D{x8@?YZbc>Mo)v7V~;JC5x8*$cT!Zl2pt4 z97s+ds#a1}{6tR4OX{k);yJh>CMJQBQ=s2=1n^L(CCwKdN#iA&bcm+|Q(fPwq!X%T ziO*;&?Wl8=oks5H%>b%>mmS9jm8S%%BC~8q1y)TRgDljqtMsok8C0YGi(isIQxqnjh*0WZ6#qXuFXPk<7P${G8yfjpe z3M$6TnG$$F8J?4NHM-xjb;gD7@t39YGOFKx@R3CnRLZD2HwVu$16cu7jE%aF4Pmyw&XMYlrW)UY!t!oYF~jlm#fhncJ_-a^txZ1`*Q6| z0;rTxbvD{w0UIMh1wOKjT)vU)-QhqQwyOh`+!m`|0}U}pMLLK{!Go&%RR_{hgmidF z0@Vi$sFYE4#ch0psddo`1r?OJOUKp0li(!|B=>c+g8idL{+(SRE0j@nR{GfIPHjO2 zUix($Tz)Pqb%1-dk{9NeqxhVlUTmW3yqqb6YEgm;eDI*^c@vm{PoLfpP{}Rhz@Hdh z0aTJ`&lDaD1R|U7j#UxOI z3iKNg56^2!SBWDj`m+eC%wwf9&mEyNDy_S(_J|#IuJ$=2*EJe0xy)&2fKH~?VdhX+h zXJTvki}A{+IyVEI${7`7jk>*#B5Z!J~O%G8}`DILcC{qBHGOEr-=Q}pfanKnR zW233SkyyKMM#U}o?v;<|j4F3?)`RdF+c_S-3?lNwuE7g^HG6WaoySE@7`ky&tO<#}C;C|6+ zoZ=G8KkcWCsxwpmBb+Dk!8Ti132v&uP3~Jpa-|x(XODX?PB@r~x2Xnni3{!?2%xhn zWmKIT>z{1i{y<-K#Mmf4JD%kH9UlhnSr}3DI|{3-x9-^*FThF}RcE92JLct6!mNt1 zvFo)-P$T$WhRiC%x7id{C*Pj<+X!J+rHrbx(e)3oF%nkbqiHJFHkEXqoka4^PErLc zxiRhWbU5b+X(PMJr-n9H$Q+ z?$|t;^j<|P&{M$Ee$tcaLRKiF>Z}}kja&;s zY@+JCl;wdJC9J>)53BO$CzG7#!N(1OmE1D=@)=zLR+8x83zJFn3n+?3zFt_#qK95& zL`7Ihsx><%lfoUS`Yu()PlT1cq^^oj{4`aDgK|~{`h$n{TyGcrvd}wIN&C*p!hI@l z-}zI!-^J5z-H@Uk_pTjvZr8Mx 
z?Q~Lg=U^^qV`s8Yw3!ZQ@_>&S_tI8Ku1OrJ1tQbyGkx4xFCa~n8;m3;cB-_8%B@1LTHfU;b=1QVCctKL6b6S$JnT7 zl5G_4Vftaw2OTT&J}RO}-A*m(eh9Mh;S!IGJ6}gQ{Wf?*?G)jc7?n|VPIf)ZW~2NL zV0b~0Waao3ExAyNee#LyBVU)$*ThapIh^qQF<+XE%BVUor57150-Yoi4RC>_F9*AX zyTbRh-%Te^FE)+aCq6r&jH)wH3{Un-GfII@l8HL_HJR>SG!glc&z!gE)M@6ycZ>QA zJe`zLbtVeonO6m!z{P=aT;(`Y^#wc&3=b*K8{@^Xqr97jNrj`|Fo`K5fezC^sDjdI zOS_&Fe~q^RIce%t%g{*~Rae}D@Jx(U=b{xc9l*+NxVHAKp5()`J_3{E>wH2!U{}dp z{e!4FE2ZZcE4Fl!s-^p^o@AbtyvQ$&wetBuz1T$6c`4`yFG@Or51vjX-|NXh7x=g# zbdr0gjth*gKqpDG`@Ei5&!Z?7^LptdiJ=yU@80W;UXlI*B3kQ2S)gXo!Cw`5klhfUF%w12JfJv-Xmj`IfSL4i)k57GzG zXSzBh!OPPlQe0am{=tpD+WaYVq9rXpA;s;%8NYM0ee1c~lQU{u*I6VXiL;nwlQAisnm=l+|Vs8A_^oV*4sfZXm zQhe|ROX1%UP-=MQwwzLUtWv^D&Xr0+@CM4@AC*w*h|wD; zG@|EM^rXu5BgE6BHfdX%a(rZU(hfa2IwCbOnRfb?xX9#)=F(yBgDaPa*Lt{QZHtW+dxL97FzJyG9fT-d7MvVfW<2@BGhM7 zn1|o&P!E2a0L$!@0NbJdXRPMTKlk zS>|hs3y-&0BK=pbiH!G44Sx)_^PlS;>7S0r5xfMgiCSVwr|W!|x<|%a;P|U70deUu za6I?OP@lP>o~!1X{qeY_xJZws>%&8WX7lUaEm8h{P@Yw>0c#UuqBE9SBIAQLg!_YE z^yIrVICybb>T=(uUTcDFyl=woLp)RBBwxNuH${2|XM}kKL%CMQ+4XK`8?(P3)Surb z)+5x*?4Y&$F(MO!Q_V{(Y#Eax6IOZz`37Jeq=tic*nf7IR5u&Fmigj&5$n~a9TtN=G)`6Ja*dphOqb~OPHsS3%|&ZS6oDV@Y*OpucUB3 zN8%$wf>Xl%yco}lZS1urHqY$Tl_l+8Z0AB*R;@9|FCOjuDYp5~okyuo)`f=zUelkH ztM<<8zo(yA9_hIhco8s~H=FhL>J9tVz)k%AcG?=A?@fVmzr0^v*l6MgvN|oFJ^zsoKbTS_EYv*;-RtV8&7} z7%&CF|1gq*Vbr1+U--HhhgBXp40|*khFL!IaqP7Umwj;9waARyug8xiIR5g16}Sq1 z0#|#@nl&H3EcT^Aevn7M;%XfK!IoW3?DO4*V+)3K!jL zb37P4!Vhyi92EL2$F_qjOv^V3`X(PbC|ophe>#X{_J=Vd_>Tw;F!`@|VAd>t_(?@V z#pRU&bSG*`8p^_6xYgph8_eL}4BsCUw>K@O`>z$3R|eAkaUd&}3ct~+@we&Ep#GK# zoBF;NZCTV<$V>8g6E$zQ3aK!%f{4x9E z97f(B#-}*XfJE`<=Nq^-01N#%ZND)lsA^hIv@&q5oDU<*vkunzuw!!E7@BT69v_ZF ztze8lAEt#1HLf#+zjb(@IVtHrI$@h3Ovq;NQ?+UB@HZNN$$y$z$r{ZH__LA!5A7%i9`=syMCcrUN3<^!SjH9bq%o0dr zun6@bNm&#y5mflgr4FGk8P@QgOCA$~tsr12gLF0L)s*e3V+@HeFDv6al)Pv$8*u^g zRZb3PJW9eiO16-~%p6kF>DBOP!EwiF2HVjW-B1Bk02M$5Pyti`6+i|4e+5)I>63wT z42=$PsL`>_MqyV?7=;lND@fUB%7HIaU7QwN%G1AEKeumRKWFuLtEPTd_5GAb<8-3x z@gT3nLHgH?jMuK@wT@JLRXpimH{9;v+>v$s4*kMd#EUu1nP4-Xa#7tcoZQ4~mGsMa z)s$_t)FP(c^Q+@4^K{Bwg?t*+Qz^Z0&FnlHdG}JxzwHn2IHT6U>ucWG`j+3~$!1VR zDEHeqH4R(!hDBZz^3{2`BHP65o>+im`qY$dAjp2uG5T42`LkK(MKU{7Hm8#?+2LW9e+E6-PFJ7rGnp?Q zPvYq!+vO(p6hGHN?Wiy;;`yqhuFYpF6iEu1dhiGyAHs+wJa|$-l%}?k;i}?m9jW}Y z*==w@oZisyp7!N=s5c3uOK4d?5(RuIPvQmS-^;_1vTNT`Z0cgrd1-dD=eJ><+qQpp z@zeSGuU7J%#A`0tczdk5t0cRIM~PwQVC>IxCD5V*r~oSPZz*sG2P$w~f%f$!TtNj; zfj^_b-aE}b_@?ds(b|63cK?I6ci(G!@9WyW(e@zH_TgJ?_r7Sm|4G{ipS9h5e%#D( Pf(oDlr~oSPOA7n|RcdkO literal 0 HcmV?d00001 diff --git a/tests/data/small_sky_to_small_sky_order1/_common_metadata b/tests/data/small_sky_to_small_sky_order1/_common_metadata new file mode 100644 index 0000000000000000000000000000000000000000..943816477faeedd41ff200a2b58466cacf115919 GIT binary patch literal 556 zcmY*X-%o=;48BaVXxu9Xl0ER^<0ex{jEN6D*yj8(foP_Qi9v>JW&;Ke)W5+$+V&VY z8?M){?N{37Y;b$Bq&^!=X=eT`SbFjG)PMzjg$FjY3{$r^XO$exL0~8 zTGZOVL$yxsjS>zoixi>N1?jxQA6drsd>5MEcNLLl^o1!THt4~P) z86LiRJzlIIbtCte?!pgEhei|O2hvuBIn0Fji8Kwq5mCDC0WA2>$EL9xoPP-5V-y9+ zf<*RaKCzUo`EZYzRP`kGhkFbGM<`Q?EBb>Arf7+muGYav$4`=um!@%{}D=x%mHB_+a$LjRzv#KgmF`4L5gMuR3MW+_X-04+s8J#pkf zJQ`y-aNy+869*2Qj3?uP7!DkW;bc4z&&K%PwzIQBduT{@(w%)XZ)WzJ>35h>mPtxJ z^_fN#|6+F9oHL{Vk$ml5#XpiSPMXD1!#7fxc~uGBZT3x z$e$hNynt~YyOYUJ*Zjcbv6*;;Ogt9(Gc$d^Aeh7-?;!eIPv6x8QI}1`BP8On$X6he z56{V78jx50mkFs=r^AZ^ugbYysUp-~6WwZZ zQ>fpvV5%t84~c$~fGBDU#Um7o$0A=&z*I5nxP|VvxM|eetAeSbQGZ+M*H$2k*+%gQjpDJ$S2b!!L8E*%8}*(F z!qY~rwbT7JH;vl3E^tm7wcbGwuDNN{Mwfu8V$^ykJ-FefQ6J-isiILIV)SPeh~l*2+HXUn{6q~aL6*rp9KT(907EVp4uzwP(z7?$du`)(seQb^mS?k}$PWw|Z3ws9(%f`4i^=Mu% 
zn3x`T1Cx3&(ydANc)LGsEvk$4aS&sR@aPK@-Lp$i654WhE;F0WrN?k Date: Tue, 14 Nov 2023 11:26:23 -0500 Subject: [PATCH 2/8] Merge recent changes --- src/hipscat/io/parquet_metadata.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/hipscat/io/parquet_metadata.py b/src/hipscat/io/parquet_metadata.py index 97037665..047d603b 100644 --- a/src/hipscat/io/parquet_metadata.py +++ b/src/hipscat/io/parquet_metadata.py @@ -33,7 +33,18 @@ def write_parquet_metadata(catalog_path, storage_options: dict = None, output_pa output_path (str): base path for writing out metadata files defaults to `catalog_path` if unspecified """ - dataset = file_io.read_parquet_dataset(catalog_path, storage_options=storage_options) + ignore_prefixes = [ + "intermediate", + "_common_metadata", + "_metadata", + ] + + dataset = file_io.read_parquet_dataset( + catalog_path, + storage_options=storage_options, + ignore_prefixes=ignore_prefixes, + exclude_invalid_files=True, + ) metadata_collector = [] for hips_file in dataset.files: From 59a088554e27c3b1fc3b69ad259da874a831c23c Mon Sep 17 00:00:00 2001 From: delucchi-cmu Date: Tue, 14 Nov 2023 12:09:49 -0500 Subject: [PATCH 3/8] Get rid of partition info files. Address pylint. --- src/hipscat/catalog/partition_info.py | 2 +- tests/data/small_sky/partition_info.csv | 2 -- tests/data/small_sky_order1/partition_info.csv | 5 ----- tests/data/small_sky_source/partition_info.csv | 15 --------------- 4 files changed, 1 insertion(+), 23 deletions(-) delete mode 100644 tests/data/small_sky/partition_info.csv delete mode 100644 tests/data/small_sky_order1/partition_info.csv delete mode 100644 tests/data/small_sky_source/partition_info.csv diff --git a/src/hipscat/catalog/partition_info.py b/src/hipscat/catalog/partition_info.py index 3230cdbf..b7a07979 100644 --- a/src/hipscat/catalog/partition_info.py +++ b/src/hipscat/catalog/partition_info.py @@ -1,5 +1,5 @@ """Container class to hold per-partition metadata""" -from typing import Any, Dict, List, Union +from typing import List import numpy as np import pandas as pd diff --git a/tests/data/small_sky/partition_info.csv b/tests/data/small_sky/partition_info.csv deleted file mode 100644 index ed015721..00000000 --- a/tests/data/small_sky/partition_info.csv +++ /dev/null @@ -1,2 +0,0 @@ -Norder,Dir,Npix,num_rows -0,0,11,131 diff --git a/tests/data/small_sky_order1/partition_info.csv b/tests/data/small_sky_order1/partition_info.csv deleted file mode 100644 index d15927f2..00000000 --- a/tests/data/small_sky_order1/partition_info.csv +++ /dev/null @@ -1,5 +0,0 @@ -Norder,Dir,Npix,num_rows -1,0,44,42 -1,0,45,29 -1,0,46,42 -1,0,47,18 diff --git a/tests/data/small_sky_source/partition_info.csv b/tests/data/small_sky_source/partition_info.csv deleted file mode 100644 index 7a5f4e9f..00000000 --- a/tests/data/small_sky_source/partition_info.csv +++ /dev/null @@ -1,15 +0,0 @@ -Norder,Dir,Npix,num_rows -0,0,4,50 -1,0,47,2395 -2,0,176,385 -2,0,177,1510 -2,0,178,1634 -2,0,179,1773 -2,0,180,655 -2,0,181,903 -2,0,182,1246 -2,0,183,1143 -2,0,184,1390 -2,0,185,2942 -2,0,186,452 -2,0,187,683 From 816041ae4f68c3c4f75424d39828a7fd9a1e886a Mon Sep 17 00:00:00 2001 From: delucchi-cmu Date: Tue, 14 Nov 2023 12:39:43 -0500 Subject: [PATCH 4/8] ./tests/ pylint --- tests/hipscat/io/file_io/test_file_io.py | 1 - tests/hipscat/io/test_parquet_metadata.py | 1 - tests/hipscat/io/test_validation.py | 1 - tests/hipscat/io/test_write_metadata.py | 104 ---------------------- tests/hipscat/pixel_tree/conftest.py | 1 - 5 files changed, 108 
deletions(-) diff --git a/tests/hipscat/io/file_io/test_file_io.py b/tests/hipscat/io/file_io/test_file_io.py index 5e805ea0..005f1935 100644 --- a/tests/hipscat/io/file_io/test_file_io.py +++ b/tests/hipscat/io/file_io/test_file_io.py @@ -8,7 +8,6 @@ from hipscat.io.file_io import ( delete_file, get_file_pointer_from_path, - load_csv_to_pandas, load_json_file, load_parquet_to_pandas, make_directory, diff --git a/tests/hipscat/io/test_parquet_metadata.py b/tests/hipscat/io/test_parquet_metadata.py index 4e2ca22b..77a7da64 100644 --- a/tests/hipscat/io/test_parquet_metadata.py +++ b/tests/hipscat/io/test_parquet_metadata.py @@ -8,7 +8,6 @@ import pyarrow as pa import hipscat.io.parquet_metadata as io -import hipscat.pixel_math as hist from hipscat.io import file_io diff --git a/tests/hipscat/io/test_validation.py b/tests/hipscat/io/test_validation.py index 77c5d768..03a4d959 100644 --- a/tests/hipscat/io/test_validation.py +++ b/tests/hipscat/io/test_validation.py @@ -18,7 +18,6 @@ def test_is_valid_catalog(tmp_path, small_sky_catalog, small_sky_pixels): # The catalog is valid if both the catalog_info and _metadata files exist, # and the catalog_info is in a valid format - partition_info_pointer = paths.get_parquet_metadata_pointer(catalog_dir_pointer) PartitionInfo.from_healpix(small_sky_pixels).write_to_metadata_files(catalog_dir_pointer) assert is_valid_catalog(catalog_dir_pointer) diff --git a/tests/hipscat/io/test_write_metadata.py b/tests/hipscat/io/test_write_metadata.py index 1ee6b910..72b3aaba 100644 --- a/tests/hipscat/io/test_write_metadata.py +++ b/tests/hipscat/io/test_write_metadata.py @@ -1,11 +1,8 @@ """Tests of file IO (reads and writes)""" import os -import shutil import numpy.testing as npt -import pandas as pd -import pyarrow as pa import hipscat.io.write_metadata as io import hipscat.pixel_math as hist @@ -145,107 +142,6 @@ def test_write_partition_info_float(assert_text_file_matches, tmp_path): assert_text_file_matches(expected_lines, metadata_filename) -def test_write_parquet_metadata(tmp_path, small_sky_dir, basic_catalog_parquet_metadata): - """Copy existing catalog and create new metadata files for it""" - catalog_base_dir = os.path.join(tmp_path, "catalog") - shutil.copytree( - small_sky_dir, - catalog_base_dir, - ) - io.write_parquet_metadata(catalog_base_dir) - check_parquet_schema(os.path.join(catalog_base_dir, "_metadata"), basic_catalog_parquet_metadata) - ## _common_metadata has 0 row groups - check_parquet_schema( - os.path.join(catalog_base_dir, "_common_metadata"), - basic_catalog_parquet_metadata, - 0, - ) - ## Re-write - should still have the same properties. 
- io.write_parquet_metadata(catalog_base_dir) - check_parquet_schema(os.path.join(catalog_base_dir, "_metadata"), basic_catalog_parquet_metadata) - ## _common_metadata has 0 row groups - check_parquet_schema( - os.path.join(catalog_base_dir, "_common_metadata"), - basic_catalog_parquet_metadata, - 0, - ) - - -def test_write_parquet_metadata_order1(tmp_path, small_sky_order1_dir, basic_catalog_parquet_metadata): - """Copy existing catalog and create new metadata files for it, - using a catalog with multiple files.""" - temp_path = os.path.join(tmp_path, "catalog") - shutil.copytree( - small_sky_order1_dir, - temp_path, - ) - - io.write_parquet_metadata(temp_path) - ## 4 row groups for 4 partitioned parquet files - check_parquet_schema( - os.path.join(temp_path, "_metadata"), - basic_catalog_parquet_metadata, - 4, - ) - ## _common_metadata has 0 row groups - check_parquet_schema( - os.path.join(temp_path, "_common_metadata"), - basic_catalog_parquet_metadata, - 0, - ) - - -def test_write_index_parquet_metadata(tmp_path): - """Create an index-like catalog, and test metadata creation.""" - temp_path = os.path.join(tmp_path, "index") - - index_parquet_path = os.path.join(temp_path, "Parts=0", "part_000_of_001.parquet") - file_io.make_directory(os.path.join(temp_path, "Parts=0")) - basic_index = pd.DataFrame({"_hipscat_id": [4000, 4001], "ps1_objid": [700, 800]}) - file_io.write_dataframe_to_parquet(basic_index, index_parquet_path) - - index_catalog_parquet_metadata = pa.schema( - [ - pa.field("_hipscat_id", pa.int64()), - pa.field("ps1_objid", pa.int64()), - ] - ) - - io.write_parquet_metadata(temp_path) - check_parquet_schema(os.path.join(tmp_path, "index", "_metadata"), index_catalog_parquet_metadata) - ## _common_metadata has 0 row groups - check_parquet_schema( - os.path.join(tmp_path, "index", "_common_metadata"), - index_catalog_parquet_metadata, - 0, - ) - - -def check_parquet_schema(file_name, expected_schema, expected_num_row_groups=1): - """Check parquet schema against expectations""" - assert file_io.does_file_or_directory_exist(file_name), f"file not found [{file_name}]" - - single_metadata = file_io.read_parquet_metadata(file_name) - schema = single_metadata.schema.to_arrow_schema() - - assert len(schema) == len( - expected_schema - ), f"object list not the same size ({len(schema)} vs {len(expected_schema)})" - - npt.assert_array_equal(schema.names, expected_schema.names) - - assert schema.equals(expected_schema, check_metadata=False) - - parquet_file = file_io.read_parquet_file(file_name) - assert parquet_file.metadata.num_row_groups == expected_num_row_groups - - for row_index in range(0, parquet_file.metadata.num_row_groups): - row_md = parquet_file.metadata.row_group(row_index) - for column_index in range(0, row_md.num_columns): - column_metadata = row_md.column(column_index) - assert column_metadata.file_path.endswith(".parquet") - - def test_read_write_fits_point_map(tmp_path): """Check that we write and can read a FITS file for spatial distribution.""" initial_histogram = hist.empty_histogram(1) diff --git a/tests/hipscat/pixel_tree/conftest.py b/tests/hipscat/pixel_tree/conftest.py index 58499324..a2b91384 100644 --- a/tests/hipscat/pixel_tree/conftest.py +++ b/tests/hipscat/pixel_tree/conftest.py @@ -2,7 +2,6 @@ import pytest -from hipscat.catalog import PartitionInfo from hipscat.pixel_math import HealpixPixel from hipscat.pixel_tree.pixel_tree_builder import PixelTreeBuilder From d8dec2e27785850b15b2b2afd5c79965db16897a Mon Sep 17 00:00:00 2001 From: delucchi-cmu Date: 
Tue, 14 Nov 2023 12:50:07 -0500 Subject: [PATCH 5/8] =[ --- src/hipscat/catalog/partition_info.py | 4 ++-- src/hipscat/io/validation.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hipscat/catalog/partition_info.py b/src/hipscat/catalog/partition_info.py index b7a07979..2983e526 100644 --- a/src/hipscat/catalog/partition_info.py +++ b/src/hipscat/catalog/partition_info.py @@ -84,8 +84,8 @@ def read_from_file(cls, metadata_file: FilePointer, storage_options: dict = None """ pixel_list = [ HealpixPixel( - row_group_stat_single_value(row_group,cls.METADATA_ORDER_COLUMN_NAME), - row_group_stat_single_value(row_group,cls.METADATA_PIXEL_COLUMN_NAME), + row_group_stat_single_value(row_group, cls.METADATA_ORDER_COLUMN_NAME), + row_group_stat_single_value(row_group, cls.METADATA_PIXEL_COLUMN_NAME), ) for row_group in read_row_group_fragments(metadata_file, storage_options) ] diff --git a/src/hipscat/io/validation.py b/src/hipscat/io/validation.py index b62c8f37..997a4d15 100644 --- a/src/hipscat/io/validation.py +++ b/src/hipscat/io/validation.py @@ -1,5 +1,5 @@ from hipscat.catalog.dataset.catalog_info_factory import from_catalog_dir -from hipscat.io import get_partition_info_pointer, get_parquet_metadata_pointer +from hipscat.io import get_parquet_metadata_pointer, get_partition_info_pointer from hipscat.io.file_io.file_pointer import FilePointer, is_regular_file From 6a56faf0bcf7b56cfc6212bf320fc52d4c4859d0 Mon Sep 17 00:00:00 2001 From: delucchi-cmu Date: Tue, 14 Nov 2023 15:43:52 -0500 Subject: [PATCH 6/8] Improve test coverage. --- tests/hipscat/catalog/test_partition_info.py | 5 ++ tests/hipscat/io/conftest.py | 31 ++++++++++ tests/hipscat/io/test_parquet_metadata.py | 61 +++++++++++--------- tests/hipscat/io/test_write_metadata.py | 29 ++++++++++ 4 files changed, 99 insertions(+), 27 deletions(-) diff --git a/tests/hipscat/catalog/test_partition_info.py b/tests/hipscat/catalog/test_partition_info.py index b82ec6c3..3a5d5121 100644 --- a/tests/hipscat/catalog/test_partition_info.py +++ b/tests/hipscat/catalog/test_partition_info.py @@ -54,6 +54,11 @@ def test_load_partition_no_file(tmp_path): with pytest.raises(FileNotFoundError): PartitionInfo.read_from_file(wrong_pointer) + wrong_path = os.path.join(tmp_path, "partition_info.csv") + wrong_pointer = file_io.get_file_pointer_from_path(wrong_path) + with pytest.raises(FileNotFoundError): + PartitionInfo.read_from_csv(wrong_pointer) + def test_get_highest_order(small_sky_order1_dir): """test the `get_highest_order` method""" diff --git a/tests/hipscat/io/conftest.py b/tests/hipscat/io/conftest.py index 1e787eda..2f154357 100644 --- a/tests/hipscat/io/conftest.py +++ b/tests/hipscat/io/conftest.py @@ -2,9 +2,11 @@ import re +import numpy.testing as npt import pyarrow as pa import pytest +from hipscat.io import file_io from hipscat.io.file_io.file_io import load_text_file from hipscat.io.file_io.file_pointer import does_file_or_directory_exist @@ -58,3 +60,32 @@ def basic_catalog_parquet_metadata(): pa.field("__index_level_0__", pa.int64()), ] ) + + +@pytest.fixture +def check_parquet_schema(): + def check_parquet_schema(file_name, expected_schema, expected_num_row_groups=1): + """Check parquet schema against expectations""" + assert file_io.does_file_or_directory_exist(file_name), f"file not found [{file_name}]" + + single_metadata = file_io.read_parquet_metadata(file_name) + schema = single_metadata.schema.to_arrow_schema() + + assert len(schema) == len( + expected_schema + ), f"object list not the same size 
({len(schema)} vs {len(expected_schema)})" + + npt.assert_array_equal(schema.names, expected_schema.names) + + assert schema.equals(expected_schema, check_metadata=False) + + parquet_file = file_io.read_parquet_file(file_name) + assert parquet_file.metadata.num_row_groups == expected_num_row_groups + + for row_index in range(0, parquet_file.metadata.num_row_groups): + row_md = parquet_file.metadata.row_group(row_index) + for column_index in range(0, row_md.num_columns): + column_metadata = row_md.column(column_index) + assert column_metadata.file_path.endswith(".parquet") + + return check_parquet_schema diff --git a/tests/hipscat/io/test_parquet_metadata.py b/tests/hipscat/io/test_parquet_metadata.py index 77a7da64..b7e2bd18 100644 --- a/tests/hipscat/io/test_parquet_metadata.py +++ b/tests/hipscat/io/test_parquet_metadata.py @@ -3,22 +3,28 @@ import os import shutil -import numpy.testing as npt import pandas as pd import pyarrow as pa +import pytest -import hipscat.io.parquet_metadata as io -from hipscat.io import file_io +from hipscat.io import file_io, paths +from hipscat.io.parquet_metadata import ( + read_row_group_fragments, + row_group_stat_single_value, + write_parquet_metadata, +) -def test_write_parquet_metadata(tmp_path, small_sky_dir, basic_catalog_parquet_metadata): +def test_write_parquet_metadata( + tmp_path, small_sky_dir, basic_catalog_parquet_metadata, check_parquet_schema +): """Copy existing catalog and create new metadata files for it""" catalog_base_dir = os.path.join(tmp_path, "catalog") shutil.copytree( small_sky_dir, catalog_base_dir, ) - io.write_parquet_metadata(catalog_base_dir) + write_parquet_metadata(catalog_base_dir) check_parquet_schema(os.path.join(catalog_base_dir, "_metadata"), basic_catalog_parquet_metadata) ## _common_metadata has 0 row groups check_parquet_schema( @@ -27,7 +33,7 @@ def test_write_parquet_metadata(tmp_path, small_sky_dir, basic_catalog_parquet_m 0, ) ## Re-write - should still have the same properties. 
- io.write_parquet_metadata(catalog_base_dir) + write_parquet_metadata(catalog_base_dir) check_parquet_schema(os.path.join(catalog_base_dir, "_metadata"), basic_catalog_parquet_metadata) ## _common_metadata has 0 row groups check_parquet_schema( @@ -37,7 +43,9 @@ def test_write_parquet_metadata(tmp_path, small_sky_dir, basic_catalog_parquet_m ) -def test_write_parquet_metadata_order1(tmp_path, small_sky_order1_dir, basic_catalog_parquet_metadata): +def test_write_parquet_metadata_order1( + tmp_path, small_sky_order1_dir, basic_catalog_parquet_metadata, check_parquet_schema +): """Copy existing catalog and create new metadata files for it, using a catalog with multiple files.""" temp_path = os.path.join(tmp_path, "catalog") @@ -46,7 +54,7 @@ def test_write_parquet_metadata_order1(tmp_path, small_sky_order1_dir, basic_cat temp_path, ) - io.write_parquet_metadata(temp_path) + write_parquet_metadata(temp_path) ## 4 row groups for 4 partitioned parquet files check_parquet_schema( os.path.join(temp_path, "_metadata"), @@ -61,7 +69,7 @@ def test_write_parquet_metadata_order1(tmp_path, small_sky_order1_dir, basic_cat ) -def test_write_index_parquet_metadata(tmp_path): +def test_write_index_parquet_metadata(tmp_path, check_parquet_schema): """Create an index-like catalog, and test metadata creation.""" temp_path = os.path.join(tmp_path, "index") @@ -77,7 +85,7 @@ def test_write_index_parquet_metadata(tmp_path): ] ) - io.write_parquet_metadata(temp_path) + write_parquet_metadata(temp_path) check_parquet_schema(os.path.join(tmp_path, "index", "_metadata"), index_catalog_parquet_metadata) ## _common_metadata has 0 row groups check_parquet_schema( @@ -87,26 +95,25 @@ def test_write_index_parquet_metadata(tmp_path): ) -def check_parquet_schema(file_name, expected_schema, expected_num_row_groups=1): - """Check parquet schema against expectations""" - assert file_io.does_file_or_directory_exist(file_name), f"file not found [{file_name}]" +def test_row_group_fragments(small_sky_order1_dir): + partition_info_file = paths.get_parquet_metadata_pointer(small_sky_order1_dir) - single_metadata = file_io.read_parquet_metadata(file_name) - schema = single_metadata.schema.to_arrow_schema() + num_row_groups = 0 + for _ in read_row_group_fragments(partition_info_file): + num_row_groups += 1 - assert len(schema) == len( - expected_schema - ), f"object list not the same size ({len(schema)} vs {len(expected_schema)})" + assert num_row_groups == 4 - npt.assert_array_equal(schema.names, expected_schema.names) - assert schema.equals(expected_schema, check_metadata=False) +def test_row_group_stats(small_sky_dir): + partition_info_file = paths.get_parquet_metadata_pointer(small_sky_dir) + first_row_group = next(read_row_group_fragments(partition_info_file)) - parquet_file = file_io.read_parquet_file(file_name) - assert parquet_file.metadata.num_row_groups == expected_num_row_groups + assert row_group_stat_single_value(first_row_group, "Norder") == 0 + assert row_group_stat_single_value(first_row_group, "Npix") == 11 - for row_index in range(0, parquet_file.metadata.num_row_groups): - row_md = parquet_file.metadata.row_group(row_index) - for column_index in range(0, row_md.num_columns): - column_metadata = row_md.column(column_index) - assert column_metadata.file_path.endswith(".parquet") + with pytest.raises(ValueError, match="doesn't have expected key"): + row_group_stat_single_value(first_row_group, "NOT HERE") + + with pytest.raises(ValueError, match="stat min != max"): + row_group_stat_single_value(first_row_group, "ra") 
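# Illustrative usage sketch (not part of the diff): the two helpers exercised
# by the tests above can be combined to list the HEALPix partitions recorded in
# a catalog's `_metadata` file, which mirrors what `PartitionInfo.read_from_file`
# does. The catalog path is an assumption, pointing at test data added in this series.
from hipscat.io import paths
from hipscat.io.parquet_metadata import read_row_group_fragments, row_group_stat_single_value
from hipscat.pixel_math import HealpixPixel

metadata_file = paths.get_parquet_metadata_pointer("tests/data/small_sky_order1")
pixels = [
    HealpixPixel(
        row_group_stat_single_value(row_group, "Norder"),
        row_group_stat_single_value(row_group, "Npix"),
    )
    for row_group in read_row_group_fragments(metadata_file)
]
# Expect the four order-1 partitions of small_sky_order1: pixels 44, 45, 46, 47.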
diff --git a/tests/hipscat/io/test_write_metadata.py b/tests/hipscat/io/test_write_metadata.py index 72b3aaba..5f563edc 100644 --- a/tests/hipscat/io/test_write_metadata.py +++ b/tests/hipscat/io/test_write_metadata.py @@ -1,6 +1,7 @@ """Tests of file IO (reads and writes)""" import os +import shutil import numpy.testing as npt @@ -142,6 +143,34 @@ def test_write_partition_info_float(assert_text_file_matches, tmp_path): assert_text_file_matches(expected_lines, metadata_filename) +def test_write_parquet_metadata( + tmp_path, small_sky_dir, basic_catalog_parquet_metadata, check_parquet_schema +): + """Copy existing catalog and create new metadata files for it""" + catalog_base_dir = os.path.join(tmp_path, "catalog") + shutil.copytree( + small_sky_dir, + catalog_base_dir, + ) + io.write_parquet_metadata(catalog_base_dir) + check_parquet_schema(os.path.join(catalog_base_dir, "_metadata"), basic_catalog_parquet_metadata) + ## _common_metadata has 0 row groups + check_parquet_schema( + os.path.join(catalog_base_dir, "_common_metadata"), + basic_catalog_parquet_metadata, + 0, + ) + ## Re-write - should still have the same properties. + io.write_parquet_metadata(catalog_base_dir) + check_parquet_schema(os.path.join(catalog_base_dir, "_metadata"), basic_catalog_parquet_metadata) + ## _common_metadata has 0 row groups + check_parquet_schema( + os.path.join(catalog_base_dir, "_common_metadata"), + basic_catalog_parquet_metadata, + 0, + ) + + def test_read_write_fits_point_map(tmp_path): """Check that we write and can read a FITS file for spatial distribution.""" initial_histogram = hist.empty_histogram(1) From 4fdd21ec274c0ab8431687b3d7ff87979b83ecce Mon Sep 17 00:00:00 2001 From: delucchi-cmu Date: Fri, 17 Nov 2023 15:36:47 -0500 Subject: [PATCH 7/8] PR responses. 
--- src/hipscat/catalog/partition_info.py | 16 +++++++++------- src/hipscat/io/parquet_metadata.py | 15 +++++++++++++-- src/hipscat/io/validation.py | 14 +++++++------- src/hipscat/pixel_math/healpix_pixel.py | 2 +- 4 files changed, 30 insertions(+), 17 deletions(-) diff --git a/src/hipscat/catalog/partition_info.py b/src/hipscat/catalog/partition_info.py index 2983e526..8d75eeeb 100644 --- a/src/hipscat/catalog/partition_info.py +++ b/src/hipscat/catalog/partition_info.py @@ -1,4 +1,6 @@ """Container class to hold per-partition metadata""" +from __future__ import annotations + from typing import List import numpy as np @@ -55,7 +57,7 @@ def write_to_metadata_files(self, catalog_path: FilePointer, storage_options: di Args: catalog_path (str): base path for the catalog - storage_options: dictionary that contains abstract filesystem credentials + storage_options (dict): dictionary that contains abstract filesystem credentials """ batches = [ pa.RecordBatch.from_arrays( @@ -72,12 +74,12 @@ def write_to_metadata_files(self, catalog_path: FilePointer, storage_options: di write_parquet_metadata_for_batches(batches, catalog_path, storage_options) @classmethod - def read_from_file(cls, metadata_file: FilePointer, storage_options: dict = None): + def read_from_file(cls, metadata_file: FilePointer, storage_options: dict = None) -> PartitionInfo: """Read partition info from a `_metadata` file to create an object Args: - metadata_file: FilePointer to the `_metadata` file - storage_options: dictionary that contains abstract filesystem credentials + metadata_file (FilePointer): FilePointer to the `_metadata` file + storage_options (dict): dictionary that contains abstract filesystem credentials Returns: A `PartitionInfo` object with the data from the file @@ -97,12 +99,12 @@ def read_from_file(cls, metadata_file: FilePointer, storage_options: dict = None return cls(pixel_list) @classmethod - def read_from_csv(cls, partition_info_file: FilePointer, storage_options: dict = None): + def read_from_csv(cls, partition_info_file: FilePointer, storage_options: dict = None) -> PartitionInfo: """Read partition info from a `partition_info.csv` file to create an object Args: partition_info_file: FilePointer to the `partition_info.csv` file - storage_options: dictionary that contains abstract filesystem credentials + storage_options (dict): dictionary that contains abstract filesystem credentials Returns: A `PartitionInfo` object with the data from the file @@ -142,7 +144,7 @@ def as_dataframe(self): return pd.DataFrame.from_dict(partition_info_dict) @classmethod - def from_healpix(cls, healpix_pixels: List[HealpixPixel]): + def from_healpix(cls, healpix_pixels: List[HealpixPixel]) -> PartitionInfo: """Create a partition info object from a list of constituent healpix pixels. Args: diff --git a/src/hipscat/io/parquet_metadata.py b/src/hipscat/io/parquet_metadata.py index 047d603b..0d26fb21 100644 --- a/src/hipscat/io/parquet_metadata.py +++ b/src/hipscat/io/parquet_metadata.py @@ -12,7 +12,12 @@ def row_group_stat_single_value(row_group, stat_key): """Convenience method to find the min and max inside a statistics dictionary, - and raise an error if they're unequal.""" + and raise an error if they're unequal. + + Args: + row_group: dataset fragment row group + stat_key (str): column name of interest. 
+ """ if stat_key not in row_group.statistics: raise ValueError(f"row group doesn't have expected key {stat_key}") stat_dict = row_group.statistics[stat_key] @@ -25,7 +30,10 @@ def row_group_stat_single_value(row_group, stat_key): def write_parquet_metadata(catalog_path, storage_options: dict = None, output_path: str = None): """Generate parquet metadata, using the already-partitioned parquet files - for this catalog + for this catalog. + + For more information on the general parquet metadata files, and why we write them, see + https://arrow.apache.org/docs/python/parquet.html#writing-metadata-and-common-metadata-files Args: catalog_path (str): base path for the catalog @@ -50,6 +58,8 @@ def write_parquet_metadata(catalog_path, storage_options: dict = None, output_pa for hips_file in dataset.files: hips_file_pointer = file_io.get_file_pointer_from_path(hips_file, include_protocol=catalog_path) single_metadata = file_io.read_parquet_metadata(hips_file_pointer, storage_options=storage_options) + + # Users must set the file path of each chunk before combining the metadata. relative_path = hips_file[len(catalog_path) :] single_metadata.set_file_path(relative_path) metadata_collector.append(single_metadata) @@ -78,6 +88,7 @@ def write_parquet_metadata_for_batches( ): """Write parquet metadata files for some pyarrow table batches. This writes the batches to a temporary parquet dataset using local storage, and + generates the metadata for the partitioned catalog parquet files. Args: batches (List[pa.RecordBatch]): create one batch per group of data (partition or row group) diff --git a/src/hipscat/io/validation.py b/src/hipscat/io/validation.py index 997a4d15..4851bc21 100644 --- a/src/hipscat/io/validation.py +++ b/src/hipscat/io/validation.py @@ -7,7 +7,7 @@ def is_valid_catalog(pointer: FilePointer) -> bool: """Checks if a catalog is valid for a given base catalog pointer Args: - pointer: pointer to base catalog directory + pointer (FilePointer): pointer to base catalog directory Returns: True if both the catalog_info and partition_info files are @@ -16,11 +16,11 @@ def is_valid_catalog(pointer: FilePointer) -> bool: return is_catalog_info_valid(pointer) and (is_partition_info_valid(pointer) or is_metadata_valid(pointer)) -def is_catalog_info_valid(pointer): +def is_catalog_info_valid(pointer: FilePointer): """Checks if catalog_info is valid for a given base catalog pointer Args: - pointer: pointer to base catalog directory + pointer (FilePointer): pointer to base catalog directory Returns: True if the catalog_info file exists, and it is correctly formatted, @@ -34,11 +34,11 @@ def is_catalog_info_valid(pointer): return is_valid -def is_partition_info_valid(pointer): +def is_partition_info_valid(pointer: FilePointer): """Checks if partition_info is valid for a given base catalog pointer Args: - pointer: pointer to base catalog directory + pointer (FilePointer): pointer to base catalog directory Returns: True if the partition_info file exists, False otherwise @@ -48,11 +48,11 @@ def is_partition_info_valid(pointer): return partition_info_exists -def is_metadata_valid(pointer): +def is_metadata_valid(pointer: FilePointer): """Checks if _metadata is valid for a given base catalog pointer Args: - pointer: pointer to base catalog directory + pointer (FilePointer): pointer to base catalog directory Returns: True if the _metadata file exists, False otherwise diff --git a/src/hipscat/pixel_math/healpix_pixel.py b/src/hipscat/pixel_math/healpix_pixel.py index fe317f82..08e578fa 100644 --- 
diff --git a/src/hipscat/pixel_math/healpix_pixel.py b/src/hipscat/pixel_math/healpix_pixel.py
index fe317f82..08e578fa 100644
--- a/src/hipscat/pixel_math/healpix_pixel.py
+++ b/src/hipscat/pixel_math/healpix_pixel.py
@@ -82,7 +82,7 @@ def convert_to_higher_order(self, delta_order: int) -> List[HealpixPixel]:
         return pixels
 
     @property
-    def dir(self):
+    def dir(self) -> int:
         """Directory number for the pixel.
 
         This is necessary for file systems that limit to 10,000 subdirectories.

From 22d61c99b30bf602f852aec34b660c0cd2b05b8d Mon Sep 17 00:00:00 2001
From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com>
Date: Mon, 20 Nov 2023 15:02:32 -0500
Subject: [PATCH 8/8] Apply suggestions from code review

Co-authored-by: Sandro Campos
---
 src/hipscat/catalog/partition_info.py |  4 ++--
 src/hipscat/io/parquet_metadata.py    |  8 +++++---
 src/hipscat/io/validation.py          |  6 +++---
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/hipscat/catalog/partition_info.py b/src/hipscat/catalog/partition_info.py
index 8d75eeeb..49a3ce9b 100644
--- a/src/hipscat/catalog/partition_info.py
+++ b/src/hipscat/catalog/partition_info.py
@@ -56,7 +56,7 @@ def write_to_metadata_files(self, catalog_path: FilePointer, storage_options: di
         """Generate parquet metadata, using the known partitions.
 
         Args:
-            catalog_path (str): base path for the catalog
+            catalog_path (FilePointer): base path for the catalog
             storage_options (dict): dictionary that contains abstract filesystem credentials
         """
         batches = [
             pa.RecordBatch.from_arrays(
@@ -103,7 +103,7 @@ def read_from_csv(cls, partition_info_file: FilePointer, storage_options: dict =
         """Read partition info from a `partition_info.csv` file to create an object
 
         Args:
-            partition_info_file: FilePointer to the `partition_info.csv` file
+            partition_info_file (FilePointer): FilePointer to the `partition_info.csv` file
             storage_options (dict): dictionary that contains abstract filesystem credentials
 
         Returns:
diff --git a/src/hipscat/io/parquet_metadata.py b/src/hipscat/io/parquet_metadata.py
index 0d26fb21..9aa84b3c 100644
--- a/src/hipscat/io/parquet_metadata.py
+++ b/src/hipscat/io/parquet_metadata.py
@@ -10,13 +10,15 @@ from hipscat.io.file_io.file_pointer import get_fs, strip_leading_slash_for_pyarrow
 
 
-def row_group_stat_single_value(row_group, stat_key):
+def row_group_stat_single_value(row_group, stat_key: str):
     """Convenience method to find the min and max inside a statistics dictionary,
     and raise an error if they're unequal.
 
     Args:
         row_group: dataset fragment row group
         stat_key (str): column name of interest.
+    Returns:
+        The value of the specified row group statistic
     """
     if stat_key not in row_group.statistics:
         raise ValueError(f"row group doesn't have expected key {stat_key}")
     stat_dict = row_group.statistics[stat_key]
@@ -28,7 +30,7 @@ def row_group_stat_single_value(row_group, stat_key):
     return min_val
 
 
-def write_parquet_metadata(catalog_path, storage_options: dict = None, output_path: str = None):
+def write_parquet_metadata(catalog_path: str, storage_options: dict = None, output_path: str = None):
     """Generate parquet metadata, using the already-partitioned parquet files
     for this catalog.
 
@@ -104,7 +106,7 @@ def write_parquet_metadata_for_batches(
     write_parquet_metadata(temp_pq_file, storage_options=storage_options, output_path=output_path)
 
 
-def read_row_group_fragments(metadata_file, storage_options: dict = None):
+def read_row_group_fragments(metadata_file: str, storage_options: dict = None):
     """Generator for metadata fragment row groups in a parquet metadata file.
 
     Args:
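Reading in the other direction, the two helpers annotated above make it possible to walk the row groups recorded in a catalog's `_metadata` file. The column names, catalog path, and use of `HealpixPixel.dir` for directory grouping below are assumptions based on the partitioning columns used elsewhere in this PR:

    from hipscat.io.parquet_metadata import read_row_group_fragments, row_group_stat_single_value
    from hipscat.pixel_math import HealpixPixel

    for row_group in read_row_group_fragments("/tmp/my_catalog/_metadata"):
        pixel = HealpixPixel(
            row_group_stat_single_value(row_group, "Norder"),
            row_group_stat_single_value(row_group, "Npix"),
        )
        # pixel.dir groups pixels into directories of at most 10,000 entries.
        print(f"Norder={pixel.order}/Dir={pixel.dir}/Npix={pixel.pixel}.parquet")
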
diff --git a/src/hipscat/io/validation.py b/src/hipscat/io/validation.py
index 4851bc21..877e7a98 100644
--- a/src/hipscat/io/validation.py
+++ b/src/hipscat/io/validation.py
@@ -16,7 +16,7 @@ def is_valid_catalog(pointer: FilePointer) -> bool:
     return is_catalog_info_valid(pointer) and (is_partition_info_valid(pointer) or is_metadata_valid(pointer))
 
 
-def is_catalog_info_valid(pointer: FilePointer):
+def is_catalog_info_valid(pointer: FilePointer) -> bool:
     """Checks if catalog_info is valid for a given base catalog pointer
 
     Args:
@@ -34,7 +34,7 @@ def is_catalog_info_valid(pointer: FilePointer):
     return is_valid
 
 
-def is_partition_info_valid(pointer: FilePointer):
+def is_partition_info_valid(pointer: FilePointer) -> bool:
     """Checks if partition_info is valid for a given base catalog pointer
 
     Args:
@@ -48,7 +48,7 @@ def is_partition_info_valid(pointer: FilePointer):
     return partition_info_exists
 
 
-def is_metadata_valid(pointer: FilePointer):
+def is_metadata_valid(pointer: FilePointer) -> bool:
     """Checks if _metadata is valid for a given base catalog pointer
 
     Args:
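Taken together, the validation helpers touched above implement the check used throughout this PR: a catalog is usable when catalog_info is present and well formed, and at least one of partition_info.csv or _metadata exists. A small usage sketch; the catalog path is illustrative, and the bare-path call to get_file_pointer_from_path is an assumption based on its use elsewhere in hipscat:

    from hipscat.io import file_io
    from hipscat.io.validation import is_metadata_valid, is_partition_info_valid, is_valid_catalog

    catalog_pointer = file_io.get_file_pointer_from_path("/tmp/my_catalog")

    if not is_valid_catalog(catalog_pointer):
        raise FileNotFoundError(
            "catalog_info missing or malformed, or neither partition_info.csv nor _metadata found"
        )

    # Either source of partition information is acceptable.
    print("partition_info.csv present:", is_partition_info_valid(catalog_pointer))
    print("_metadata present:", is_metadata_valid(catalog_pointer))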