diff --git a/src/hipscat/catalog/__init__.py b/src/hipscat/catalog/__init__.py index 9d3247cb..9de6a40a 100644 --- a/src/hipscat/catalog/__init__.py +++ b/src/hipscat/catalog/__init__.py @@ -4,4 +4,5 @@ from .catalog import Catalog from .catalog_type import CatalogType from .dataset.dataset import Dataset +from .margin_cache.margin_catalog import MarginCatalog from .partition_info import PartitionInfo diff --git a/src/hipscat/catalog/catalog.py b/src/hipscat/catalog/catalog.py index f56df3e6..6e3ea035 100644 --- a/src/hipscat/catalog/catalog.py +++ b/src/hipscat/catalog/catalog.py @@ -29,7 +29,7 @@ class Catalog(HealpixDataset): `Norder=/Dir=/Npix=.parquet` """ - HIPS_CATALOG_TYPES = [CatalogType.OBJECT, CatalogType.SOURCE, CatalogType.MARGIN] + HIPS_CATALOG_TYPES = [CatalogType.OBJECT, CatalogType.SOURCE] # Update CatalogInfoClass, used to check if the catalog_info is the correct type, and # set the catalog info to the correct type diff --git a/src/hipscat/catalog/margin_cache/margin_catalog.py b/src/hipscat/catalog/margin_cache/margin_catalog.py new file mode 100644 index 00000000..743f9460 --- /dev/null +++ b/src/hipscat/catalog/margin_cache/margin_catalog.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from typing_extensions import TypeAlias + +from hipscat.catalog.catalog_type import CatalogType +from hipscat.catalog.healpix_dataset.healpix_dataset import HealpixDataset, PixelInputTypes +from hipscat.catalog.margin_cache import MarginCacheCatalogInfo + + +class MarginCatalog(HealpixDataset): + """A HiPSCat Catalog used to contain the 'margin' of another HiPSCat catalog. + + Catalogs of this type are used alongside a primary catalog, and contain the margin points for each + HEALPix pixel - any points that are within a certain distance from the HEALPix pixel boundary. This is + used to ensure spatial operations such as crossmatching can be performed efficiently while maintaining + accuracy. 
+ """ + # Update CatalogInfoClass, used to check if the catalog_info is the correct type, and + # set the catalog info to the correct type + CatalogInfoClass: TypeAlias = MarginCacheCatalogInfo + catalog_info: CatalogInfoClass + + def __init__( + self, + catalog_info: CatalogInfoClass, + pixels: PixelInputTypes, + catalog_path: str = None, + storage_options: dict | None = None, + ) -> None: + """Initializes a Margin Catalog + + Args: + catalog_info: CatalogInfo object with catalog metadata + pixels: Specifies the pixels contained in the catalog. Can be either a + list of HealpixPixel, `PartitionInfo object`, or a `PixelTree` object + catalog_path: If the catalog is stored on disk, specify the location of the catalog + Does not load the catalog from this path, only store as metadata + storage_options: dictionary that contains abstract filesystem credentials + """ + if catalog_info.catalog_type != CatalogType.MARGIN: + raise ValueError( + f"Catalog info `catalog_type` must equal {CatalogType.MARGIN}" + ) + super().__init__(catalog_info, pixels, catalog_path, storage_options) diff --git a/tests/conftest.py b/tests/conftest.py index 477a0526..3a02f881 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,6 +9,7 @@ from hipscat.catalog.association_catalog.partition_join_info import PartitionJoinInfo from hipscat.catalog.catalog_info import CatalogInfo from hipscat.catalog.dataset.base_catalog_info import BaseCatalogInfo +from hipscat.catalog.margin_cache import MarginCacheCatalogInfo from hipscat.inspection.almanac import Almanac from hipscat.pixel_math import HealpixPixel @@ -126,7 +127,7 @@ def source_catalog_info_with_extra() -> dict: @pytest.fixture -def margin_cache_catalog_info() -> dict: +def margin_cache_catalog_info_data() -> dict: return { "catalog_name": "test_margin", "catalog_type": "margin", @@ -189,6 +190,23 @@ def catalog_info(catalog_info_data) -> CatalogInfo: return CatalogInfo(**catalog_info_data) +@pytest.fixture +def 
margin_catalog_info(margin_cache_catalog_info_data) -> MarginCacheCatalogInfo: + return MarginCacheCatalogInfo(**margin_cache_catalog_info_data) + + +@pytest.fixture +def margin_catalog_pixels() -> List[HealpixPixel]: + return [ + HealpixPixel(0, 4), HealpixPixel(1, 44), HealpixPixel(1, 45), HealpixPixel(1, 46), HealpixPixel(1, 47) + ] + + +@pytest.fixture +def margin_catalog_path(test_data_dir) -> str: + return os.path.join(test_data_dir, "small_sky_order1_margin") + + @pytest.fixture def catalog_pixels() -> List[HealpixPixel]: return [HealpixPixel(1, 0), HealpixPixel(1, 1), HealpixPixel(2, 8)] diff --git a/tests/data/small_sky_order1_margin/Norder=0/Dir=0/Npix=4.parquet b/tests/data/small_sky_order1_margin/Norder=0/Dir=0/Npix=4.parquet new file mode 100644 index 00000000..bffbc12a Binary files /dev/null and b/tests/data/small_sky_order1_margin/Norder=0/Dir=0/Npix=4.parquet differ diff --git a/tests/data/small_sky_order1_margin/Norder=1/Dir=0/Npix=44.parquet b/tests/data/small_sky_order1_margin/Norder=1/Dir=0/Npix=44.parquet new file mode 100644 index 00000000..918fcac6 Binary files /dev/null and b/tests/data/small_sky_order1_margin/Norder=1/Dir=0/Npix=44.parquet differ diff --git a/tests/data/small_sky_order1_margin/Norder=1/Dir=0/Npix=45.parquet b/tests/data/small_sky_order1_margin/Norder=1/Dir=0/Npix=45.parquet new file mode 100644 index 00000000..2e28861e Binary files /dev/null and b/tests/data/small_sky_order1_margin/Norder=1/Dir=0/Npix=45.parquet differ diff --git a/tests/data/small_sky_order1_margin/Norder=1/Dir=0/Npix=46.parquet b/tests/data/small_sky_order1_margin/Norder=1/Dir=0/Npix=46.parquet new file mode 100644 index 00000000..5648407b Binary files /dev/null and b/tests/data/small_sky_order1_margin/Norder=1/Dir=0/Npix=46.parquet differ diff --git a/tests/data/small_sky_order1_margin/Norder=1/Dir=0/Npix=47.parquet b/tests/data/small_sky_order1_margin/Norder=1/Dir=0/Npix=47.parquet new file mode 100644 index 00000000..43a2482c Binary files /dev/null and 
b/tests/data/small_sky_order1_margin/Norder=1/Dir=0/Npix=47.parquet differ diff --git a/tests/data/small_sky_order1_margin/README.md b/tests/data/small_sky_order1_margin/README.md new file mode 100644 index 00000000..e16e1987 --- /dev/null +++ b/tests/data/small_sky_order1_margin/README.md @@ -0,0 +1,29 @@ +# Catalog description + +This catalog exists as a margin cache of the small_sky_order1 table, +allowing spatial operations to be performed efficiently and accurately. + +This catalog was generated using the following snippet: + +``` +from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments +from hipscat_import.margin_cache import generate_margin_cache + +margin_args = MarginCacheArguments( + margin_threshold=7200, + input_catalog_path="data/small_sky_order1", + output_path="data/", + output_artifact_name="small_sky_order1_margin" +) + + +if __name__ == "__main__": + generate_margin_cache(margin_args, client) +``` + +NB: + +- The `margin_threshold` setting of 7200 arcseconds (2 degrees) is much higher than + the threshold a margin cache would usually be generated with, but is used because the small sky test + dataset is sparse. +- The `small_sky_order1` catalog only contains points in Norder1, Npix=[44, 45, 46, 47], but the margin catalog also contains points in Norder0, Npix=4 due to negative pixel margins. 
diff --git a/tests/data/small_sky_order1_margin/_common_metadata b/tests/data/small_sky_order1_margin/_common_metadata new file mode 100644 index 00000000..64c19780 Binary files /dev/null and b/tests/data/small_sky_order1_margin/_common_metadata differ diff --git a/tests/data/small_sky_order1_margin/_metadata b/tests/data/small_sky_order1_margin/_metadata new file mode 100644 index 00000000..57346559 Binary files /dev/null and b/tests/data/small_sky_order1_margin/_metadata differ diff --git a/tests/data/small_sky_order1_margin/catalog_info.json b/tests/data/small_sky_order1_margin/catalog_info.json new file mode 100644 index 00000000..5d732fb5 --- /dev/null +++ b/tests/data/small_sky_order1_margin/catalog_info.json @@ -0,0 +1,7 @@ +{ + "catalog_name": "small_sky_order1_margin", + "catalog_type": "margin", + "total_rows": 28, + "primary_catalog": "small_sky_order1", + "margin_threshold": 7200 +} diff --git a/tests/data/small_sky_order1_margin/provenance_info.json b/tests/data/small_sky_order1_margin/provenance_info.json new file mode 100644 index 00000000..cb4a3dac --- /dev/null +++ b/tests/data/small_sky_order1_margin/provenance_info.json @@ -0,0 +1,28 @@ +{ + "catalog_name": "small_sky_order1_margin", + "catalog_type": "margin", + "total_rows": 28, + "primary_catalog": "data/small_sky_order1", + "margin_threshold": 7200, + "version": "0.2.3", + "generation_date": "2024.01.30", + "tool_args": { + "tool_name": "hipscat_import", + "version": "0.2.2", + "runtime_args": { + "catalog_name": "small_sky_order1_margin", + "output_path": "data/", + "output_artifact_name": "small_sky_order1_margin", + "tmp_dir": "", + "overwrite": false, + "dask_tmp": "", + "dask_n_workers": 1, + "dask_threads_per_worker": 1, + "catalog_path": "data/small_sky_order1_margin", + "tmp_path": "data/small_sky_order1_margin/intermediate", + "input_catalog_path": "data/small_sky_order1", + "margin_threshold": 7200, + "margin_order": 2 + } + } +} diff --git 
a/tests/hipscat/catalog/dataset/test_catalog_info_factory.py b/tests/hipscat/catalog/dataset/test_catalog_info_factory.py index 5c4bfc6c..b0cd28bc 100644 --- a/tests/hipscat/catalog/dataset/test_catalog_info_factory.py +++ b/tests/hipscat/catalog/dataset/test_catalog_info_factory.py @@ -50,8 +50,8 @@ def test_create_catalog_info_source(source_catalog_info, source_catalog_info_wit assert isinstance(catalog_info, SourceCatalogInfo) -def test_create_catalog_info_margin_cache(margin_cache_catalog_info): - catalog_info = create_catalog_info(margin_cache_catalog_info) +def test_create_catalog_info_margin_cache(margin_cache_catalog_info_data): + catalog_info = create_catalog_info(margin_cache_catalog_info_data) assert catalog_info.catalog_name == "test_margin" assert isinstance(catalog_info, BaseCatalogInfo) assert isinstance(catalog_info, MarginCacheCatalogInfo) diff --git a/tests/hipscat/catalog/margin_cache/test_margin_cache_catalog_info.py b/tests/hipscat/catalog/margin_cache/test_margin_cache_catalog_info.py index 2f9fa8ad..262ed268 100644 --- a/tests/hipscat/catalog/margin_cache/test_margin_cache_catalog_info.py +++ b/tests/hipscat/catalog/margin_cache/test_margin_cache_catalog_info.py @@ -8,16 +8,16 @@ from hipscat.io import file_io -def test_margin_cache_catalog_info(margin_cache_catalog_info, assert_catalog_info_matches_dict): - info = MarginCacheCatalogInfo(**margin_cache_catalog_info) - assert_catalog_info_matches_dict(info, margin_cache_catalog_info) +def test_margin_cache_catalog_info(margin_cache_catalog_info_data, assert_catalog_info_matches_dict): + info = MarginCacheCatalogInfo(**margin_cache_catalog_info_data) + assert_catalog_info_matches_dict(info, margin_cache_catalog_info_data) -def test_str(margin_cache_catalog_info): +def test_str(margin_cache_catalog_info_data): correct_string = "" - for name, value in margin_cache_catalog_info.items(): + for name, value in margin_cache_catalog_info_data.items(): correct_string += f" {name} {value}\n" - cat_info = 
MarginCacheCatalogInfo(**margin_cache_catalog_info) + cat_info = MarginCacheCatalogInfo(**margin_cache_catalog_info_data) assert str(cat_info) == correct_string @@ -38,29 +38,29 @@ def test_read_from_file(margin_cache_catalog_info_file, assert_catalog_info_matc assert_catalog_info_matches_dict(catalog_info, catalog_info_json) -def test_required_fields_missing(margin_cache_catalog_info): +def test_required_fields_missing(margin_cache_catalog_info_data): required_fields = ["primary_catalog", "margin_threshold"] for required_field in required_fields: assert required_field in MarginCacheCatalogInfo.required_fields for field in required_fields: - init_data = margin_cache_catalog_info.copy() + init_data = margin_cache_catalog_info_data.copy() init_data[field] = None with pytest.raises(ValueError, match=field): MarginCacheCatalogInfo(**init_data) -def test_type_missing(margin_cache_catalog_info): - init_data = margin_cache_catalog_info.copy() +def test_type_missing(margin_cache_catalog_info_data): + init_data = margin_cache_catalog_info_data.copy() init_data["catalog_type"] = None catalog_info = MarginCacheCatalogInfo(**init_data) assert catalog_info.catalog_type == CatalogType.MARGIN -def test_wrong_type(margin_cache_catalog_info, catalog_info_data): +def test_wrong_type(margin_cache_catalog_info_data, catalog_info_data): with pytest.raises(TypeError, match="unexpected"): MarginCacheCatalogInfo(**catalog_info_data) with pytest.raises(ValueError, match=f"{CatalogType.MARGIN}"): - init_data = margin_cache_catalog_info.copy() + init_data = margin_cache_catalog_info_data.copy() init_data["catalog_type"] = CatalogType.OBJECT MarginCacheCatalogInfo(**init_data) diff --git a/tests/hipscat/catalog/margin_cache/test_margin_catalog.py b/tests/hipscat/catalog/margin_cache/test_margin_catalog.py new file mode 100644 index 00000000..1e5eaefb --- /dev/null +++ b/tests/hipscat/catalog/margin_cache/test_margin_catalog.py @@ -0,0 +1,77 @@ +import json +import os + +import pytest + +from 
hipscat.catalog import CatalogType, MarginCatalog, PartitionInfo +from hipscat.pixel_tree.pixel_node_type import PixelNodeType + + +def test_init_catalog(margin_catalog_info, margin_catalog_pixels): + catalog = MarginCatalog( + margin_catalog_info, margin_catalog_pixels + ) + assert catalog.catalog_name == margin_catalog_info.catalog_name + assert catalog.get_healpix_pixels() == margin_catalog_pixels + assert catalog.catalog_info == margin_catalog_info + + assert len(catalog.get_healpix_pixels()) == len(margin_catalog_pixels) + for hp_pixel in catalog.get_healpix_pixels(): + assert hp_pixel in margin_catalog_pixels + assert hp_pixel in catalog.pixel_tree + assert catalog.pixel_tree[hp_pixel].node_type == PixelNodeType.LEAF + + +def test_wrong_catalog_type(margin_catalog_info, margin_catalog_pixels): + margin_catalog_info.catalog_type = CatalogType.OBJECT + with pytest.raises(ValueError, match="catalog_type"): + MarginCatalog(margin_catalog_info, margin_catalog_pixels) + + +def test_wrong_catalog_info_type(catalog_info, margin_catalog_pixels): + catalog_info.catalog_type = CatalogType.MARGIN + with pytest.raises(TypeError, match="catalog_info"): + MarginCatalog(catalog_info, margin_catalog_pixels) + + +def test_read_from_file(margin_catalog_path, margin_catalog_pixels): + catalog = MarginCatalog.read_from_hipscat(margin_catalog_path) + assert catalog.on_disk + assert catalog.catalog_path == margin_catalog_path + assert len(catalog.get_healpix_pixels()) == len(margin_catalog_pixels) + assert catalog.get_healpix_pixels() == margin_catalog_pixels + + info = catalog.catalog_info + assert info.catalog_name == "small_sky_order1_margin" + assert info.catalog_type == CatalogType.MARGIN + assert info.primary_catalog == "small_sky_order1" + assert info.margin_threshold == 7200 + + +def test_empty_directory(tmp_path, margin_cache_catalog_info_data, margin_catalog_pixels): + """Test loading empty or incomplete data""" + ## Path doesn't exist + with 
pytest.raises(FileNotFoundError): + MarginCatalog.read_from_hipscat(os.path.join("path", "empty")) + + catalog_path = os.path.join(tmp_path, "empty") + os.makedirs(catalog_path, exist_ok=True) + + ## Path exists but there's nothing there + with pytest.raises(FileNotFoundError, match="catalog info"): + MarginCatalog.read_from_hipscat(catalog_path) + + ## catalog_info file exists - getting closer + file_name = os.path.join(catalog_path, "catalog_info.json") + with open(file_name, "w", encoding="utf-8") as metadata_file: + metadata_file.write(json.dumps(margin_cache_catalog_info_data)) + + with pytest.raises(FileNotFoundError, match="metadata"): + MarginCatalog.read_from_hipscat(catalog_path) + + ## Now we create the needed _metadata and everything is right. + part_info = PartitionInfo.from_healpix(margin_catalog_pixels) + part_info.write_to_metadata_files(catalog_path=catalog_path) + + catalog = MarginCatalog.read_from_hipscat(catalog_path) + assert catalog.catalog_name == margin_cache_catalog_info_data["catalog_name"]