Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Margin Catalog #199

Merged
merged 7 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/hipscat/catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
from .catalog import Catalog
from .catalog_type import CatalogType
from .dataset.dataset import Dataset
from .margin_cache.margin_catalog import MarginCatalog
from .partition_info import PartitionInfo
2 changes: 1 addition & 1 deletion src/hipscat/catalog/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class Catalog(HealpixDataset):
`Norder=/Dir=/Npix=.parquet`
"""

HIPS_CATALOG_TYPES = [CatalogType.OBJECT, CatalogType.SOURCE, CatalogType.MARGIN]
HIPS_CATALOG_TYPES = [CatalogType.OBJECT, CatalogType.SOURCE]

# Update CatalogInfoClass, used to check if the catalog_info is the correct type, and
# set the catalog info to the correct type
Expand Down
44 changes: 44 additions & 0 deletions src/hipscat/catalog/margin_cache/margin_catalog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from __future__ import annotations

from typing_extensions import TypeAlias

from hipscat.catalog.catalog_type import CatalogType
from hipscat.catalog.healpix_dataset.healpix_dataset import HealpixDataset, PixelInputTypes
from hipscat.catalog.margin_cache import MarginCacheCatalogInfo


class MarginCatalog(HealpixDataset):
    """A HiPSCat Catalog used to contain the 'margin' of another HiPSCat catalog.

    Catalogs of this type are used alongside a primary catalog, and contain the margin points for each
    HEALPix pixel - any points that are within a certain distance from the HEALPix pixel boundary. This is
    used to ensure spatial operations such as crossmatching can be performed efficiently while maintaining
    accuracy.
    """

    # Update CatalogInfoClass, used to check if the catalog_info is the correct type, and
    # set the catalog info to the correct type
    CatalogInfoClass: TypeAlias = MarginCacheCatalogInfo
    catalog_info: CatalogInfoClass

    def __init__(
        self,
        catalog_info: CatalogInfoClass,
        pixels: PixelInputTypes,
        catalog_path: str | None = None,
        storage_options: dict | None = None,
    ) -> None:
        """Initializes a Margin Catalog

        Args:
            catalog_info: CatalogInfo object with catalog metadata
            pixels: Specifies the pixels contained in the catalog. Can be either a
                list of HealpixPixel, `PartitionInfo object`, or a `PixelTree` object
            catalog_path: If the catalog is stored on disk, specify the location of the catalog
                Does not load the catalog from this path, only store as metadata
            storage_options: dictionary that contains abstract filesystem credentials

        Raises:
            ValueError: if `catalog_info.catalog_type` is not `CatalogType.MARGIN`
        """
        # Reject non-margin catalog info up front, before handing off to the parent initializer.
        if catalog_info.catalog_type != CatalogType.MARGIN:
            raise ValueError(f"Catalog info `catalog_type` must equal {CatalogType.MARGIN}")
        super().__init__(catalog_info, pixels, catalog_path, storage_options)
20 changes: 19 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from hipscat.catalog.association_catalog.partition_join_info import PartitionJoinInfo
from hipscat.catalog.catalog_info import CatalogInfo
from hipscat.catalog.dataset.base_catalog_info import BaseCatalogInfo
from hipscat.catalog.margin_cache import MarginCacheCatalogInfo
from hipscat.inspection.almanac import Almanac
from hipscat.pixel_math import HealpixPixel

Expand Down Expand Up @@ -126,7 +127,7 @@ def source_catalog_info_with_extra() -> dict:


@pytest.fixture
def margin_cache_catalog_info() -> dict:
def margin_cache_catalog_info_data() -> dict:
return {
"catalog_name": "test_margin",
"catalog_type": "margin",
Expand Down Expand Up @@ -189,6 +190,23 @@ def catalog_info(catalog_info_data) -> CatalogInfo:
return CatalogInfo(**catalog_info_data)


@pytest.fixture
def margin_catalog_info(margin_cache_catalog_info_data) -> MarginCacheCatalogInfo:
    """Margin cache catalog info object built from the `margin_cache_catalog_info_data` dictionary."""
    info = MarginCacheCatalogInfo(**margin_cache_catalog_info_data)
    return info


@pytest.fixture
def margin_catalog_pixels() -> List[HealpixPixel]:
    """Pixels of the margin test catalog: order 0 pixel 4, followed by order 1 pixels 44 through 47."""
    pixels = [HealpixPixel(0, 4)]
    pixels.extend(HealpixPixel(1, pixel) for pixel in range(44, 48))
    return pixels


@pytest.fixture
def margin_catalog_path(test_data_dir) -> str:
    """Path of the `small_sky_order1_margin` catalog, joined onto `test_data_dir`."""
    catalog_dir_name = "small_sky_order1_margin"
    return os.path.join(test_data_dir, catalog_dir_name)


@pytest.fixture
def catalog_pixels() -> List[HealpixPixel]:
    """Three pixels: order 1 pixels 0 and 1, and order 2 pixel 8."""
    order_pixel_pairs = [(1, 0), (1, 1), (2, 8)]
    return [HealpixPixel(order, pixel) for order, pixel in order_pixel_pairs]
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
29 changes: 29 additions & 0 deletions tests/data/small_sky_order1_margin/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Catalog description

This catalog exists as a margin cache of the small_sky_order1 table,
allowing spatial operations to be performed efficiently and accurately.

This catalog was generated using the following snippet:

```
from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments
from hipscat_import.margin_cache import generate_margin_cache

margin_args = MarginCacheArguments(
margin_threshold=7200,
input_catalog_path="data/small_sky_order1",
output_path="data/",
output_artifact_name="small_sky_order1_margin"
)


if __name__ == "__main__":
generate_margin_cache(margin_args, client)
```

NB:

- The setting `margin_threshold` at 7200 arcseconds (2 degrees) is much higher than
the threshold a margin cache would usually be generated with, but is used because the
small sky test dataset is sparse.
smcguire-cmu marked this conversation as resolved.
Show resolved Hide resolved
- The `small_sky_order1` catalog only contains points in Norder1, Npix=[44, 45, 46, 47], but the margin catalog also contains points in Norder0, Npix=4 due to negative pixel margins.
Binary file not shown.
Binary file added tests/data/small_sky_order1_margin/_metadata
Binary file not shown.
7 changes: 7 additions & 0 deletions tests/data/small_sky_order1_margin/catalog_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"catalog_name": "small_sky_order1_margin",
"catalog_type": "margin",
"total_rows": 28,
"primary_catalog": "small_sky_order1",
"margin_threshold": 7200
}
28 changes: 28 additions & 0 deletions tests/data/small_sky_order1_margin/provenance_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"catalog_name": "small_sky_order1_margin",
"catalog_type": "margin",
"total_rows": 28,
"primary_catalog": "data/small_sky_order1",
"margin_threshold": 7200,
"version": "0.2.3",
"generation_date": "2024.01.30",
"tool_args": {
"tool_name": "hipscat_import",
"version": "0.2.2",
"runtime_args": {
"catalog_name": "small_sky_order1_margin",
"output_path": "data/",
"output_artifact_name": "small_sky_order1_margin",
"tmp_dir": "",
"overwrite": false,
"dask_tmp": "",
"dask_n_workers": 1,
"dask_threads_per_worker": 1,
"catalog_path": "data/small_sky_order1_margin",
"tmp_path": "data/small_sky_order1_margin/intermediate",
"input_catalog_path": "data/small_sky_order1",
"margin_threshold": 7200,
"margin_order": 2
}
}
}
4 changes: 2 additions & 2 deletions tests/hipscat/catalog/dataset/test_catalog_info_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ def test_create_catalog_info_source(source_catalog_info, source_catalog_info_wit
assert isinstance(catalog_info, SourceCatalogInfo)


def test_create_catalog_info_margin_cache(margin_cache_catalog_info):
catalog_info = create_catalog_info(margin_cache_catalog_info)
def test_create_catalog_info_margin_cache(margin_cache_catalog_info_data):
catalog_info = create_catalog_info(margin_cache_catalog_info_data)
assert catalog_info.catalog_name == "test_margin"
assert isinstance(catalog_info, BaseCatalogInfo)
assert isinstance(catalog_info, MarginCacheCatalogInfo)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@
from hipscat.io import file_io


def test_margin_cache_catalog_info(margin_cache_catalog_info, assert_catalog_info_matches_dict):
info = MarginCacheCatalogInfo(**margin_cache_catalog_info)
assert_catalog_info_matches_dict(info, margin_cache_catalog_info)
def test_margin_cache_catalog_info(margin_cache_catalog_info_data, assert_catalog_info_matches_dict):
info = MarginCacheCatalogInfo(**margin_cache_catalog_info_data)
assert_catalog_info_matches_dict(info, margin_cache_catalog_info_data)


def test_str(margin_cache_catalog_info):
def test_str(margin_cache_catalog_info_data):
correct_string = ""
for name, value in margin_cache_catalog_info.items():
for name, value in margin_cache_catalog_info_data.items():
correct_string += f" {name} {value}\n"
cat_info = MarginCacheCatalogInfo(**margin_cache_catalog_info)
cat_info = MarginCacheCatalogInfo(**margin_cache_catalog_info_data)
assert str(cat_info) == correct_string


Expand All @@ -38,29 +38,29 @@ def test_read_from_file(margin_cache_catalog_info_file, assert_catalog_info_matc
assert_catalog_info_matches_dict(catalog_info, catalog_info_json)


def test_required_fields_missing(margin_cache_catalog_info):
def test_required_fields_missing(margin_cache_catalog_info_data):
required_fields = ["primary_catalog", "margin_threshold"]
for required_field in required_fields:
assert required_field in MarginCacheCatalogInfo.required_fields
for field in required_fields:
init_data = margin_cache_catalog_info.copy()
init_data = margin_cache_catalog_info_data.copy()
init_data[field] = None
with pytest.raises(ValueError, match=field):
MarginCacheCatalogInfo(**init_data)


def test_type_missing(margin_cache_catalog_info):
init_data = margin_cache_catalog_info.copy()
def test_type_missing(margin_cache_catalog_info_data):
init_data = margin_cache_catalog_info_data.copy()
init_data["catalog_type"] = None
catalog_info = MarginCacheCatalogInfo(**init_data)
assert catalog_info.catalog_type == CatalogType.MARGIN


def test_wrong_type(margin_cache_catalog_info, catalog_info_data):
def test_wrong_type(margin_cache_catalog_info_data, catalog_info_data):
with pytest.raises(TypeError, match="unexpected"):
MarginCacheCatalogInfo(**catalog_info_data)

with pytest.raises(ValueError, match=f"{CatalogType.MARGIN}"):
init_data = margin_cache_catalog_info.copy()
init_data = margin_cache_catalog_info_data.copy()
init_data["catalog_type"] = CatalogType.OBJECT
MarginCacheCatalogInfo(**init_data)
77 changes: 77 additions & 0 deletions tests/hipscat/catalog/margin_cache/test_margin_catalog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import json
import os

import pytest

from hipscat.catalog import CatalogType, MarginCatalog, PartitionInfo
from hipscat.pixel_tree.pixel_node_type import PixelNodeType


def test_init_catalog(margin_catalog_info, margin_catalog_pixels):
    """Build a margin catalog in memory and check its name, info, pixel list and pixel tree."""
    margin_catalog = MarginCatalog(margin_catalog_info, margin_catalog_pixels)

    # Top-level properties mirror what was passed to the constructor.
    assert margin_catalog.catalog_name == margin_catalog_info.catalog_name
    assert margin_catalog.get_healpix_pixels() == margin_catalog_pixels
    assert margin_catalog.catalog_info == margin_catalog_info

    # Every reported pixel is an expected one and is a leaf of the pixel tree.
    assert len(margin_catalog.get_healpix_pixels()) == len(margin_catalog_pixels)
    for pixel in margin_catalog.get_healpix_pixels():
        assert pixel in margin_catalog_pixels
        assert pixel in margin_catalog.pixel_tree
        tree_node = margin_catalog.pixel_tree[pixel]
        assert tree_node.node_type == PixelNodeType.LEAF


def test_wrong_catalog_type(margin_catalog_info, margin_catalog_pixels):
    """A margin catalog info whose `catalog_type` is switched to OBJECT is rejected with ValueError."""
    margin_catalog_info.catalog_type = CatalogType.OBJECT
    expect_value_error = pytest.raises(ValueError, match="catalog_type")
    with expect_value_error:
        MarginCatalog(margin_catalog_info, margin_catalog_pixels)


def test_wrong_catalog_info_type(catalog_info, margin_catalog_pixels):
    """The `catalog_info` fixture is rejected with TypeError even when its `catalog_type` is MARGIN."""
    catalog_info.catalog_type = CatalogType.MARGIN
    expect_type_error = pytest.raises(TypeError, match="catalog_info")
    with expect_type_error:
        MarginCatalog(catalog_info, margin_catalog_pixels)


def test_read_from_file(margin_catalog_path, margin_catalog_pixels):
    """Load the on-disk margin catalog and check its path, pixels and catalog info fields."""
    margin_catalog = MarginCatalog.read_from_hipscat(margin_catalog_path)
    assert margin_catalog.on_disk
    assert margin_catalog.catalog_path == margin_catalog_path
    assert len(margin_catalog.get_healpix_pixels()) == len(margin_catalog_pixels)
    assert margin_catalog.get_healpix_pixels() == margin_catalog_pixels

    # Values expected from the catalog_info.json stored with the test catalog.
    loaded_info = margin_catalog.catalog_info
    expected_fields = {
        "catalog_name": "small_sky_order1_margin",
        "catalog_type": CatalogType.MARGIN,
        "primary_catalog": "small_sky_order1",
        "margin_threshold": 7200,
    }
    for field_name, expected_value in expected_fields.items():
        assert getattr(loaded_info, field_name) == expected_value


def test_empty_directory(tmp_path, margin_cache_catalog_info_data, margin_catalog_pixels):
    """Test loading empty or incomplete data"""
    # Step 1: the path does not exist at all.
    missing_path = os.path.join("path", "empty")
    with pytest.raises(FileNotFoundError):
        MarginCatalog.read_from_hipscat(missing_path)

    empty_catalog_dir = os.path.join(tmp_path, "empty")
    os.makedirs(empty_catalog_dir, exist_ok=True)

    # Step 2: the directory exists but holds no catalog info file.
    with pytest.raises(FileNotFoundError, match="catalog info"):
        MarginCatalog.read_from_hipscat(empty_catalog_dir)

    # Step 3: write catalog_info.json; the partition metadata is still missing.
    info_file_path = os.path.join(empty_catalog_dir, "catalog_info.json")
    with open(info_file_path, "w", encoding="utf-8") as info_file:
        json.dump(margin_cache_catalog_info_data, info_file)

    with pytest.raises(FileNotFoundError, match="metadata"):
        MarginCatalog.read_from_hipscat(empty_catalog_dir)

    # Step 4: write the partition metadata files; loading now succeeds.
    partition_info = PartitionInfo.from_healpix(margin_catalog_pixels)
    partition_info.write_to_metadata_files(catalog_path=empty_catalog_dir)

    loaded_catalog = MarginCatalog.read_from_hipscat(empty_catalog_dir)
    assert loaded_catalog.catalog_name == margin_cache_catalog_info_data["catalog_name"]
Loading