Skip to content

Commit

Permalink
Merge pull request #216 from astronomy-commons/sandro/catalog-factory…
Browse files Browse the repository at this point in the history
…-method

Dataset factory method
  • Loading branch information
camposandro authored Feb 19, 2024
2 parents cb38ea3 + c33a552 commit a051d36
Show file tree
Hide file tree
Showing 14 changed files with 147 additions and 68 deletions.
9 changes: 5 additions & 4 deletions docs/notebooks/catalog_size_inspection.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@
"metadata": {},
"outputs": [],
"source": [
"from hipscat.catalog.catalog import Catalog\n",
"from hipscat.io import paths\n",
"import hipscat\n",
"import os\n",
"\n",
"### Change this path!!!\n",
Expand All @@ -43,12 +42,14 @@
"### ----------------\n",
"### You probably won't have to change anything from here.\n",
"\n",
"catalog = Catalog.read_from_hipscat(catalog_dir)\n",
"catalog = hipscat.read_from_hipscat(catalog_dir)\n",
"\n",
"info_frame = catalog.partition_info.as_dataframe()\n",
"\n",
"for index, partition in info_frame.iterrows():\n",
" file_name = result = paths.pixel_catalog_file(catalog_dir, partition[\"Norder\"], partition[\"Npix\"])\n",
" file_name = result = hipscat.io.paths.pixel_catalog_file(\n",
" catalog_dir, partition[\"Norder\"], partition[\"Npix\"]\n",
" )\n",
" info_frame.loc[index, \"size_on_disk\"] = os.path.getsize(file_name)\n",
"\n",
"info_frame = info_frame.astype(int)\n",
Expand Down
27 changes: 6 additions & 21 deletions docs/notebooks/cone_search.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,9 @@
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from hipscat.catalog import Catalog\n",
"from hipscat import inspection\n",
"import hipscat\n",
"import healpy as hp\n",
"import numpy as np\n",
"\n",
"## Fill in these variables with what's relevant in your use case:\n",
"\n",
Expand All @@ -39,7 +38,7 @@
"source": [
"## Load catalog\n",
"\n",
"catalog = Catalog.read_from_hipscat(catalog_path)"
"catalog = hipscat.read_from_hipscat(catalog_path)"
]
},
{
Expand All @@ -50,7 +49,7 @@
"source": [
"## Plot catalog pixels\n",
"\n",
"inspection.plot_pixels(catalog)"
"hipscat.inspection.plot_pixels(catalog)"
]
},
{
Expand Down Expand Up @@ -81,27 +80,13 @@
"\n",
"filtered_catalog = catalog.filter_by_cone(ra, dec, radius)\n",
"\n",
"inspection.plot_pixels(filtered_catalog)"
"hipscat.inspection.plot_pixels(filtered_catalog)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "hipscatenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
"name": "python"
}
},
"nbformat": 4,
Expand Down
3 changes: 2 additions & 1 deletion src/hipscat/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
"""High-level namespace, hipscat"""

from . import catalog, io, pixel_math
from . import catalog, inspection, io, pixel_math
from .loaders import read_from_hipscat
13 changes: 4 additions & 9 deletions src/hipscat/catalog/dataset/dataset.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,16 @@
from typing import Any, Dict, Tuple, Union

from typing_extensions import Self, TypeAlias
from typing_extensions import Self

from hipscat.catalog.dataset.base_catalog_info import BaseCatalogInfo
from hipscat.io import FilePointer, file_io, paths


class Dataset:
"""A base HiPSCat dataset
"""A base HiPSCat dataset that contains a catalog_info metadata file
and the data contained in parquet files"""

A base dataset contains a catalog_info metadata file and the data contained in parquet files
TODO - create factory methods to get appropriately-typed datasets for
some catalog info or catalog directory
"""

CatalogInfoClass: TypeAlias = BaseCatalogInfo
CatalogInfoClass = BaseCatalogInfo

def __init__(
self,
Expand Down
1 change: 1 addition & 0 deletions src/hipscat/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .read_from_hipscat import read_from_hipscat
59 changes: 59 additions & 0 deletions src/hipscat/loaders/read_from_hipscat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from __future__ import annotations

from typing import Type

from hipscat import io
from hipscat.catalog import AssociationCatalog, Catalog, CatalogType, Dataset, MarginCatalog
from hipscat.catalog.dataset import BaseCatalogInfo
from hipscat.catalog.index.index_catalog import IndexCatalog

# Lookup table from a catalog's declared CatalogType to the class whose
# `read_from_hipscat` is used to load it. OBJECT and SOURCE catalogs are both
# loaded with the `Catalog` class. A type missing from this table is reported
# as unsupported by `_get_loader_from_catalog_type`.
CATALOG_TYPE_TO_CLASS = {
    CatalogType.OBJECT: Catalog,
    CatalogType.SOURCE: Catalog,
    CatalogType.ASSOCIATION: AssociationCatalog,
    CatalogType.INDEX: IndexCatalog,
    CatalogType.MARGIN: MarginCatalog,
}


def read_from_hipscat(
    catalog_path: str,
    catalog_type: CatalogType | None = None,
    storage_options: dict | None = None,
) -> Dataset:
    """Read a HiPSCat dataset from a HiPSCat directory.

    The class used to load the dataset is chosen from the catalog type, so the
    caller does not need to know in advance which kind of catalog lives at
    `catalog_path`.

    Args:
        catalog_path (str): path to the root directory of the catalog
        catalog_type (CatalogType): Default `None`. When `None`, the type is read
            from the catalog info file found under `catalog_path`. When given, that
            value is used as-is to select the loader class and the catalog info
            file is not consulted for the type.
        storage_options (dict): dictionary that contains abstract filesystem
            credentials. It is used both when reading the catalog info file and
            when the selected class loads the catalog.

    Returns:
        The initialized catalog object, as returned by the `read_from_hipscat`
        method of the class selected for the catalog type.

    Raises:
        NotImplementedError: if no loader class is registered for the catalog type.
    """
    if catalog_type is None:
        catalog_type_to_use = _read_dataset_class_from_metadata(
            catalog_path, storage_options=storage_options
        )
    else:
        catalog_type_to_use = catalog_type

    loader = _get_loader_from_catalog_type(catalog_type_to_use)
    # The loader re-reads files under `catalog_path`, so it needs the same
    # filesystem credentials that were used to resolve the catalog type.
    return loader.read_from_hipscat(catalog_path, storage_options=storage_options)


def _read_dataset_class_from_metadata(
    catalog_base_path: str, storage_options: dict | None = None
) -> CatalogType:
    """Return the catalog type recorded in a catalog's info file.

    Args:
        catalog_base_path (str): path to the root directory of the catalog
        storage_options (dict): dictionary that contains abstract filesystem
            credentials, forwarded to the catalog info read

    Returns:
        The `catalog_type` attribute of the catalog info read from the catalog.
    """
    base_pointer = io.file_io.get_file_pointer_from_path(catalog_base_path)
    info_pointer = io.paths.get_catalog_info_pointer(base_pointer)
    return BaseCatalogInfo.read_from_metadata_file(
        info_pointer, storage_options=storage_options
    ).catalog_type


def _get_loader_from_catalog_type(catalog_type: CatalogType) -> Type[Dataset]:
    """Return the class registered in `CATALOG_TYPE_TO_CLASS` for `catalog_type`.

    Raises:
        NotImplementedError: if `catalog_type` has no entry in the table.
    """
    loader_class = CATALOG_TYPE_TO_CLASS.get(catalog_type)
    if loader_class is None:
        # Every registered value is a class, so None can only mean "no entry".
        raise NotImplementedError(f"Cannot load catalog of type {catalog_type}")
    return loader_class
7 changes: 7 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
SMALL_SKY_DIR_NAME = "small_sky"
SMALL_SKY_ORDER1_DIR_NAME = "small_sky_order1"
SMALL_SKY_TO_SMALL_SKY_ORDER1_DIR_NAME = "small_sky_to_small_sky_order1"
SMALL_SKY_SOURCE_OBJECT_INDEX_DIR_NAME = "small_sky_source_object_index"

TEST_DIR = os.path.dirname(__file__)

# pylint: disable=missing-function-docstring, redefined-outer-name
Expand Down Expand Up @@ -48,6 +50,11 @@ def small_sky_to_small_sky_order1_dir(test_data_dir):
return os.path.join(test_data_dir, SMALL_SKY_TO_SMALL_SKY_ORDER1_DIR_NAME)


@pytest.fixture
def small_sky_source_object_index_dir(test_data_dir):
    """Path of the `small_sky_source_object_index` directory under the test data dir."""
    index_dir_name = SMALL_SKY_SOURCE_OBJECT_INDEX_DIR_NAME
    return os.path.join(test_data_dir, index_dir_name)


@pytest.fixture
def assert_catalog_info_matches_dict():
def assert_match(catalog_info: BaseCatalogInfo, dictionary: dict):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from hipscat.catalog import CatalogType
from hipscat.catalog.association_catalog.association_catalog import AssociationCatalog
from hipscat.catalog.association_catalog.partition_join_info import PartitionJoinInfo
from hipscat.loaders import read_from_hipscat
from hipscat.pixel_math import HealpixPixel
from hipscat.pixel_tree.pixel_node_type import PixelNodeType

Expand Down Expand Up @@ -51,7 +52,9 @@ def test_different_join_pixels_type(association_catalog_info, association_catalo


def test_read_from_file(association_catalog_path, association_catalog_join_pixels):
catalog = AssociationCatalog.read_from_hipscat(association_catalog_path)
catalog = read_from_hipscat(association_catalog_path)

assert isinstance(catalog, AssociationCatalog)
assert catalog.on_disk
assert catalog.catalog_path == association_catalog_path
assert len(catalog.get_join_pixels()) == 4
Expand All @@ -70,7 +73,7 @@ def test_empty_directory(tmp_path, association_catalog_info_data, association_ca
"""Test loading empty or incomplete data"""
## Path doesn't exist
with pytest.raises(FileNotFoundError):
AssociationCatalog.read_from_hipscat(os.path.join("path", "empty"))
read_from_hipscat(os.path.join("path", "empty"))

catalog_path = os.path.join(tmp_path, "empty")
os.makedirs(catalog_path, exist_ok=True)
Expand All @@ -85,20 +88,20 @@ def test_empty_directory(tmp_path, association_catalog_info_data, association_ca
metadata_file.write(json.dumps(association_catalog_info_data))

with pytest.raises(FileNotFoundError, match="metadata"):
AssociationCatalog.read_from_hipscat(catalog_path)
read_from_hipscat(catalog_path)

## Now we create the needed _metadata and everything is right.
part_info = PartitionJoinInfo(association_catalog_join_pixels)
part_info.write_to_metadata_files(catalog_path=catalog_path)
with pytest.warns(UserWarning, match="slow"):
catalog = AssociationCatalog.read_from_hipscat(catalog_path)
catalog = read_from_hipscat(catalog_path)
assert catalog.catalog_name == association_catalog_info_data["catalog_name"]


def test_csv_round_trip(tmp_path, association_catalog_info_data, association_catalog_join_pixels):
## Path doesn't exist
with pytest.raises(FileNotFoundError):
AssociationCatalog.read_from_hipscat(os.path.join("path", "empty"))
read_from_hipscat(os.path.join("path", "empty"))

catalog_path = os.path.join(tmp_path, "empty")
os.makedirs(catalog_path, exist_ok=True)
Expand All @@ -108,17 +111,17 @@ def test_csv_round_trip(tmp_path, association_catalog_info_data, association_cat
metadata_file.write(json.dumps(association_catalog_info_data))

with pytest.raises(FileNotFoundError, match="partition"):
AssociationCatalog.read_from_hipscat(catalog_path)
read_from_hipscat(catalog_path)

file_name = os.path.join(catalog_path, "partition_info.csv")
with open(file_name, "w", encoding="utf-8") as metadata_file:
# dump some garbage in there - just needs to exist.
metadata_file.write(json.dumps(association_catalog_info_data))
with pytest.raises(FileNotFoundError, match="partition"):
AssociationCatalog.read_from_hipscat(catalog_path)
read_from_hipscat(catalog_path)

part_info = PartitionJoinInfo(association_catalog_join_pixels)
part_info.write_to_csv(catalog_path=catalog_path)

catalog = AssociationCatalog.read_from_hipscat(catalog_path)
catalog = read_from_hipscat(catalog_path)
pd.testing.assert_frame_equal(catalog.get_join_pixels(), association_catalog_join_pixels)
11 changes: 5 additions & 6 deletions tests/hipscat/catalog/index/test_index_catalog.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
import os

import numpy.testing as npt

from hipscat.catalog.index.index_catalog import IndexCatalog
from hipscat.loaders import read_from_hipscat
from hipscat.pixel_math import HealpixPixel


def test_loc_partition(test_data_dir):
index_catalog_dir = os.path.join(test_data_dir, "small_sky_source_object_index")
catalog = IndexCatalog.read_from_hipscat(index_catalog_dir)
def test_loc_partition(small_sky_source_object_index_dir):
catalog = read_from_hipscat(small_sky_source_object_index_dir)

assert isinstance(catalog, IndexCatalog)
assert catalog.on_disk
assert catalog.catalog_path == index_catalog_dir
assert catalog.catalog_path == small_sky_source_object_index_dir

npt.assert_array_equal(catalog.loc_partitions([700]), [HealpixPixel(2, 184)])
npt.assert_array_equal(catalog.loc_partitions([707]), [HealpixPixel(2, 176), HealpixPixel(2, 178)])
Expand Down
11 changes: 11 additions & 0 deletions tests/hipscat/catalog/loaders/test_read_from_hipscat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import pytest

from hipscat.catalog import CatalogType
from hipscat.loaders import read_from_hipscat


def test_read_from_hipscat_wrong_catalog_type(small_sky_dir):
    """Explicit catalog types that cannot load the small sky catalog must raise."""
    # (requested type, expected exception, expected message fragment), checked in order.
    failing_requests = (
        (CatalogType.ASSOCIATION, ValueError, "must have type"),
        ("unknown", NotImplementedError, "load catalog of type"),
    )
    for requested_type, expected_error, message_fragment in failing_requests:
        with pytest.raises(expected_error, match=message_fragment):
            read_from_hipscat(small_sky_dir, catalog_type=requested_type)
16 changes: 10 additions & 6 deletions tests/hipscat/catalog/margin_cache/test_margin_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

from hipscat.catalog import CatalogType, MarginCatalog, PartitionInfo
from hipscat.loaders import read_from_hipscat
from hipscat.pixel_tree.pixel_node_type import PixelNodeType


Expand Down Expand Up @@ -33,7 +34,9 @@ def test_wrong_catalog_info_type(catalog_info, margin_catalog_pixels):


def test_read_from_file(margin_catalog_path, margin_catalog_pixels):
catalog = MarginCatalog.read_from_hipscat(margin_catalog_path)
catalog = read_from_hipscat(margin_catalog_path)

assert isinstance(catalog, MarginCatalog)
assert catalog.on_disk
assert catalog.catalog_path == margin_catalog_path
assert len(catalog.get_healpix_pixels()) == len(margin_catalog_pixels)
Expand All @@ -46,31 +49,32 @@ def test_read_from_file(margin_catalog_path, margin_catalog_pixels):
assert info.margin_threshold == 7200


# pylint: disable=duplicate-code
def test_empty_directory(tmp_path, margin_cache_catalog_info_data, margin_catalog_pixels):
"""Test loading empty or incomplete data"""
## Path doesn't exist
with pytest.raises(FileNotFoundError):
MarginCatalog.read_from_hipscat(os.path.join("path", "empty"))
read_from_hipscat(os.path.join("path", "empty"))

catalog_path = os.path.join(tmp_path, "empty")
os.makedirs(catalog_path, exist_ok=True)

## Path exists but there's nothing there
with pytest.raises(FileNotFoundError, match="catalog info"):
MarginCatalog.read_from_hipscat(catalog_path)
with pytest.raises(FileNotFoundError, match="catalog_info"):
read_from_hipscat(catalog_path)

## catalog_info file exists - getting closer
file_name = os.path.join(catalog_path, "catalog_info.json")
with open(file_name, "w", encoding="utf-8") as metadata_file:
metadata_file.write(json.dumps(margin_cache_catalog_info_data))

with pytest.raises(FileNotFoundError, match="metadata"):
MarginCatalog.read_from_hipscat(catalog_path)
read_from_hipscat(catalog_path)

## Now we create the needed _metadata and everything is right.
part_info = PartitionInfo.from_healpix(margin_catalog_pixels)
part_info.write_to_metadata_files(catalog_path=catalog_path)

with pytest.warns(UserWarning, match="slow"):
catalog = MarginCatalog.read_from_hipscat(catalog_path)
catalog = read_from_hipscat(catalog_path)
assert catalog.catalog_name == margin_cache_catalog_info_data["catalog_name"]
Loading

0 comments on commit a051d36

Please sign in to comment.