diff --git a/README.md b/README.md
index 09bd209..ebf1865 100644
--- a/README.md
+++ b/README.md
@@ -62,13 +62,15 @@ docker exec -it mast-api python -m src.api.create /code/data/metadata/mini
 ### Running Unit Tests
 
 Verify everything is setup correctly by running the unit tests.
-To run the unit tests you may use `pytest` like so:
+To run the unit tests, input the following command inside your environment:
 
 ```bash
-python -m pytest tests
+pytest -rsx tests/ --data-path="INSERT FULL PATH TO DATA HERE"
 ```
 
-This will run some unit tests for the REST and GraphQL APIs against the data in the database.
+The data path will be along the lines of `~/fair-mast/data/metadata/mini`.
+
+This will run some unit tests for the REST and GraphQL APIs against a testing database, created from the data in `--data-path`.
 
 ### Uploading Data to the Minio Storage
 
diff --git a/data_creation_for_test.py b/data_creation_for_test.py
deleted file mode 100644
index 3e3c5ea..0000000
--- a/data_creation_for_test.py
+++ /dev/null
@@ -1,134 +0,0 @@
-import pandas as pd
-from src.api.create import get_dataset_item_uuid, get_dataset_uuid, read_cpf_metadata, lookup_status_code
-from pathlib import Path
-
-LAST_MAST_SHOT = 30471
-
-def create_cpf_summary(url, data_path: Path):
-    paths = data_path.glob("*_cpf_columns.parquet")
-    for path in paths:
-        df = pd.read_parquet(path)
-        df.to_sql("cpf_summary", url, if_exists="replace")
-
-
-def create_scenarios(url, data_path: Path):
-    """Create the scenarios metadata table"""
-    shot_file_name = data_path.parent / "shot_metadata.parquet"
-    shot_metadata = pd.read_parquet(shot_file_name)
-    ids = shot_metadata["scenario_id"].unique()
-    scenarios = shot_metadata["scenario"].unique()
-    data = pd.DataFrame(dict(id=ids, name=scenarios)).set_index("id")
-    data = data.dropna()
-    data.to_sql("scenarios", url, if_exists="append")
-
-
-def create_shots(url, data_path: Path):
-    """Create the shot metadata table"""
-    shot_file_name = data_path.parent / "shot_metadata.parquet"
-    shot_metadata = pd.read_parquet(shot_file_name)
-    shot_metadata = shot_metadata.loc[shot_metadata["shot_id"] <= LAST_MAST_SHOT]
-    shot_metadata["facility"] = "MAST"
-    shot_metadata = shot_metadata.set_index("shot_id", drop=True)
-    shot_metadata = shot_metadata.sort_index()
-    shot_metadata["scenario"] = shot_metadata["scenario_id"]
-    shot_metadata = shot_metadata.drop(["scenario_id", "reference_id"], axis=1)
-    shot_metadata["uuid"] = shot_metadata.index.map(get_dataset_uuid)
-    shot_metadata["url"] = (
-        "s3://mast/shots/"
-        + shot_metadata["campaign"]
-        + "/"
-        + shot_metadata.index.astype(str)
-        + ".zarr"
-    )
-    paths = data_path.glob("*_cpf_data.parquet")
-    cpfs = []
-    for path in paths:
-        cpf_metadata = read_cpf_metadata(path)
-        cpf_metadata = cpf_metadata.set_index("shot_id", drop=True)
-        cpf_metadata = cpf_metadata.sort_index()
-        cpfs.append(cpf_metadata)
-    cpfs = pd.concat(cpfs, axis=0)
-    shot_metadata = pd.merge(
-        shot_metadata,
-        cpfs,
-        left_on="shot_id",
-        right_on="shot_id",
-        how="inner",
-    )
-    shot_metadata.to_sql("shots", url, if_exists="append")
-
-def create_signals(url, data_path: Path):
-    file_names = data_path.glob("signals/**/*.parquet")
-    file_names = list(file_names)
-    for file_name in file_names:
-        signals_metadata = pd.read_parquet(file_name)
-        signals_metadata = signals_metadata.rename(
-            columns=dict(shot_nums="shot_id")
-        )
-        if len(signals_metadata) == 0 or "shot_id" not in signals_metadata.columns:
-            continue
-        df = signals_metadata
-        df = df[df.shot_id <= LAST_MAST_SHOT].copy()
-        df = df.rename({"dataset_item_uuid": "uuid"}, axis=1)
df.rename({"dataset_item_uuid": "uuid"}, axis=1) - df["uuid"] = [ - get_dataset_item_uuid(item["name"], item["shot_id"]) - for key, item in df.iterrows() - ] - df["quality"] = df["status"].map(lookup_status_code) - df["shape"] = df["shape"].map( - lambda x: x.tolist() if x is not None else None - ) - df["url"] = ( - "s3://mast/shots/M9/" + df["shot_id"].map(str) + ".zarr/" + df["group"] - ) - df["version"] = 0 - df["signal_type"] = df["type"] - if "IMAGE_SUBCLASS" not in df: - df["IMAGE_SUBCLASS"] = None - df["subclass"] = df["IMAGE_SUBCLASS"] - if "format" not in df: - df["format"] = None - if "units" not in df: - df['units'] = '' - columns = [ - "uuid", - "shot_id", - "quality", - "shape", - "name", - "url", - "version", - "units", - "signal_type", - "description", - "subclass", - "format", - ] - df = df[columns] - df = df.set_index("shot_id") - df.to_sql("signals", url, if_exists="append") - - -def create_sources(url, data_path: Path): - source_metadata = pd.read_parquet(data_path.parent / "sources_metadata.parquet") - source_metadata["name"] = source_metadata["source_alias"] - source_metadata["source_type"] = source_metadata["type"] - source_metadata = source_metadata[["description", "name", "source_type"]] - source_metadata = source_metadata.drop_duplicates() - source_metadata = source_metadata.sort_values("name") - source_metadata.to_sql("sources", url, if_exists="append", index=False) - -def create_shot_source_links(url, data_path: Path): - sources_metadata = pd.read_parquet( - data_path.parent / "sources_metadata.parquet" - ) - sources_metadata["source"] = sources_metadata["source_alias"] - sources_metadata["quality"] = sources_metadata["status"].map(lookup_status_code) - sources_metadata["shot_id"] = sources_metadata["shot"].astype(int) - sources_metadata = sources_metadata[ - ["source", "shot_id", "quality", "pass", "format"] - ] - sources_metadata = sources_metadata.sort_values("source") - sources_metadata.to_sql( - "shot_source_link", url, if_exists="append", index=False - ) \ No newline at end of file diff --git a/docs/environment.yml b/docs/environment.yml index c12936c..c9cc953 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -22,4 +22,5 @@ dependencies: - gql - ipykernel - requests_toolbelt + - intake-xarray diff --git a/docs/load_data.ipynb b/docs/load_data.ipynb index 30cc7a8..7fdbb57 100644 --- a/docs/load_data.ipynb +++ b/docs/load_data.ipynb @@ -130,7 +130,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/sljack/miniconda3/envs/mast-book/lib/python3.11/site-packages/intake_xarray/base.py:21: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", + "/opt/homebrew/anaconda3/envs/mast-book/lib/python3.11/site-packages/intake_xarray/base.py:21: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.\n", " 'dims': dict(self._ds.dims),\n" ] }, @@ -520,8 +520,8 @@ " status: 1\n", " time_index: 0\n", " type: Analysed\n", - " units: kA