From f288a55f218661fbe8bcb0b7a6fad5ed13c5ec1b Mon Sep 17 00:00:00 2001 From: Bryan Hilbert Date: Wed, 30 Oct 2024 14:11:25 -0400 Subject: [PATCH 1/7] Filter rootnames/filenames before calling the filename_parser --- jwql/utils/constants.py | 1 + jwql/utils/utils.py | 13 +++- .../apps/jwql/archive_database_update.py | 72 ++++++++++++++++++- 3 files changed, 84 insertions(+), 2 deletions(-) diff --git a/jwql/utils/constants.py b/jwql/utils/constants.py index 37d390a74..37ee98757 100644 --- a/jwql/utils/constants.py +++ b/jwql/utils/constants.py @@ -381,6 +381,7 @@ FILE_PROG_ID_LEN = 5 FILE_SEG_LEN = 3 FILE_SOURCE_ID_LEN = 5 +FILE_SOURCE_ID_LONG_LEN = 9 FILE_TARG_ID_LEN = 3 FILE_VISIT_GRP_LEN = 2 FILE_VISIT_LEN = 3 diff --git a/jwql/utils/utils.py b/jwql/utils/utils.py index d96388d9a..49d060422 100644 --- a/jwql/utils/utils.py +++ b/jwql/utils/utils.py @@ -52,7 +52,7 @@ from jwql.utils.constants import FILE_AC_CAR_ID_LEN, FILE_AC_O_ID_LEN, FILE_ACT_LEN, \ FILE_DATETIME_LEN, FILE_EPOCH_LEN, FILE_GUIDESTAR_ATTMPT_LEN_MIN, \ FILE_GUIDESTAR_ATTMPT_LEN_MAX, FILE_OBS_LEN, FILE_PARALLEL_SEQ_ID_LEN, \ - FILE_PROG_ID_LEN, FILE_SEG_LEN, FILE_SOURCE_ID_LEN, FILE_SUFFIX_TYPES, \ + FILE_PROG_ID_LEN, FILE_SEG_LEN, FILE_SOURCE_ID_LEN, FILE_SOURCE_ID_LONG_LEN, FILE_SUFFIX_TYPES, \ FILE_TARG_ID_LEN, FILE_VISIT_GRP_LEN, FILE_VISIT_LEN, FILETYPE_WO_STANDARD_SUFFIX, \ JWST_INSTRUMENT_NAMES_SHORTHAND, ON_GITHUB_ACTIONS __location__ = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) @@ -426,6 +426,17 @@ def filename_parser(filename): r"(?P\d{" + f"{FILE_VISIT_LEN}" + "})"\ r"(_.._msa.fits)" + # Stage 2 WFSS source-based files + # e.g. 
jw06434-c1021_s000001510_nircam_f444w-grismr + #stage_2_source = \ + # r"jw" \ + # r"(?P\d{" + f"{FILE_PROG_ID_LEN}" + "})"\ + # r"-(?P(o\d{" + f"{FILE_AC_O_ID_LEN}" + r"}|(c|a|r)\d{" + f"{FILE_AC_CAR_ID_LEN}" + "}))"\ + # r"_(?P(s)\d{" + f"{FILE_SOURCE_ID_LONG_LEN}" + "})"\ + # r"_(?P(nircam|niriss|nirspec|miri|fgs))"\ + # r"_(?P((?!_)[\w-])+)"\ + # r"-" + # Stage 3 filenames with target ID # e.g. "jw80600-o009_t001_miri_f1130w_i2d.fits" stage_3_target_id = \ diff --git a/jwql/website/apps/jwql/archive_database_update.py b/jwql/website/apps/jwql/archive_database_update.py index 748cde5dc..674991f8b 100755 --- a/jwql/website/apps/jwql/archive_database_update.py +++ b/jwql/website/apps/jwql/archive_database_update.py @@ -43,13 +43,20 @@ import logging import os import argparse +import re import numpy as np import django from django.apps import apps from jwql.utils.protect_module import lock_module -from jwql.utils.constants import DEFAULT_MODEL_CHARFIELD +from jwql.utils.constants import (DEFAULT_MODEL_CHARFIELD, + FILE_PROG_ID_LEN, + FILE_AC_O_ID_LEN, + FILE_AC_CAR_ID_LEN, + FILE_SOURCE_ID_LONG_LEN, + FILE_TARG_ID_LEN + ) # These lines are needed in order to use the Django models in a standalone # script (as opposed to code run as a result of a webpage request). 
If these @@ -113,6 +120,11 @@ def get_updates(update_database): # Get set of unique rootnames all_rootnames = set(['_'.join(f.split('/')[-1].split('_')[:-1]) for f in filenames]) + + # Filter source-based level 2 files out of the rootnames and filenames + all_rootnames = filter_rootnames(all_rootnames) + filenames = filter_filenames(filenames, all_rootnames) + rootnames = [] for rootname in all_rootnames: filename_dict = filename_parser(rootname) @@ -510,6 +522,64 @@ def fill_empty_rootfileinfo(rootfileinfo_set): logging.info(f'\tSaved {saved_rootfileinfos} Root File Infos') +def filter_filenames(fnames, roots): + """Filter out filenames from ``fnames`` that don't match the names in ``roots`` + + Parameters + ---------- + fnames : list + List of filenames + + roots : list + List of rootnames + + Returns + ------- + filtered_fnames : list + Filtered list of filenames + """ + filtered_fnames = [] + for fname in fnames: + for root in roots: + if root in fname: + filtered_fnames.append(fname) + break + return filtered_fnames + + +def filter_rootnames(rootnames): + """Filter out rootnames that we know can't be parsed by the filename_parser. We use this + custom filter here rather than within the filename parser itself because in archive_database_update + we can end up providing thousands of unrecognized filenames (e.g. source-based WFSS files) to + the filename parser, which would result in thousands of logging statements and massive log files. + This way, we filter out the rootnames that obviously won't be parsed before calling the + filename_parser with the rest.
jw06434-c1021_s000001510_nircam_f444w-grismr + jw06434-c1021_t000_nircam_clear-f090w_segm.fits + + Parameters + ---------- + rootnames : list + List of rootnames + + Returns + ------- + good_rootnames : list + List of rootnames that do not match the filters + """ + stage_2_source = \ + r"jw" \ + r"(?P\d{" + f"{FILE_PROG_ID_LEN}" + "})"\ + r"-(?P(o\d{" + f"{FILE_AC_O_ID_LEN}" + r"}|(c|a|r)\d{" + f"{FILE_AC_CAR_ID_LEN}" + "}))"\ + r"_(?P(s\d{" + f"{FILE_SOURCE_ID_LONG_LEN}" + r"}|(t)\d{" + f"{FILE_TARG_ID_LEN}" + "}))"\ + r"_(?P(nircam|niriss|miri))"\ + r"_(?P((?!_)[\w-])+)"\ + r"-" + + elements = re.compile(stage_2_source) + good_rootnames = [e for e in rootnames if elements.match(e) is None] + return good_rootnames + + @lock_module def protected_code(update_database, fill_empty_list): """Protected code ensures only 1 instance of module will run at any given time From b074376716e8bd3c12f785e2e0898016327adf87 Mon Sep 17 00:00:00 2001 From: Bryan Hilbert Date: Wed, 30 Oct 2024 14:19:58 -0400 Subject: [PATCH 2/7] Remove commented out regex --- jwql/utils/utils.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/jwql/utils/utils.py b/jwql/utils/utils.py index 49d060422..349b5fe0d 100644 --- a/jwql/utils/utils.py +++ b/jwql/utils/utils.py @@ -426,17 +426,6 @@ def filename_parser(filename): r"(?P\d{" + f"{FILE_VISIT_LEN}" + "})"\ r"(_.._msa.fits)" - # Stage 2 WFSS source-based files - # e.g. jw06434-c1021_s000001510_nircam_f444w-grismr - #stage_2_source = \ - # r"jw" \ - # r"(?P\d{" + f"{FILE_PROG_ID_LEN}" + "})"\ - # r"-(?P(o\d{" + f"{FILE_AC_O_ID_LEN}" + r"}|(c|a|r)\d{" + f"{FILE_AC_CAR_ID_LEN}" + "}))"\ - # r"_(?P(s)\d{" + f"{FILE_SOURCE_ID_LONG_LEN}" + "})"\ - # r"_(?P(nircam|niriss|nirspec|miri|fgs))"\ - # r"_(?P((?!_)[\w-])+)"\ - # r"-" - # Stage 3 filenames with target ID # e.g. 
"jw80600-o009_t001_miri_f1130w_i2d.fits" stage_3_target_id = \ From dbad33779f18b2ae29af3b1c64a5b617ce0cf161 Mon Sep 17 00:00:00 2001 From: Bryan Hilbert Date: Wed, 30 Oct 2024 14:20:53 -0400 Subject: [PATCH 3/7] Remove unused constant --- jwql/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jwql/utils/utils.py b/jwql/utils/utils.py index 349b5fe0d..d96388d9a 100644 --- a/jwql/utils/utils.py +++ b/jwql/utils/utils.py @@ -52,7 +52,7 @@ from jwql.utils.constants import FILE_AC_CAR_ID_LEN, FILE_AC_O_ID_LEN, FILE_ACT_LEN, \ FILE_DATETIME_LEN, FILE_EPOCH_LEN, FILE_GUIDESTAR_ATTMPT_LEN_MIN, \ FILE_GUIDESTAR_ATTMPT_LEN_MAX, FILE_OBS_LEN, FILE_PARALLEL_SEQ_ID_LEN, \ - FILE_PROG_ID_LEN, FILE_SEG_LEN, FILE_SOURCE_ID_LEN, FILE_SOURCE_ID_LONG_LEN, FILE_SUFFIX_TYPES, \ + FILE_PROG_ID_LEN, FILE_SEG_LEN, FILE_SOURCE_ID_LEN, FILE_SUFFIX_TYPES, \ FILE_TARG_ID_LEN, FILE_VISIT_GRP_LEN, FILE_VISIT_LEN, FILETYPE_WO_STANDARD_SUFFIX, \ JWST_INSTRUMENT_NAMES_SHORTHAND, ON_GITHUB_ACTIONS __location__ = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) From e125c9813710d3cdb558035c6dddcc179c03e543 Mon Sep 17 00:00:00 2001 From: Bryan Hilbert Date: Wed, 15 Jan 2025 10:56:44 -0500 Subject: [PATCH 4/7] Add test for new function --- jwql/tests/test_archive_database_update.py | 38 ++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 jwql/tests/test_archive_database_update.py diff --git a/jwql/tests/test_archive_database_update.py b/jwql/tests/test_archive_database_update.py new file mode 100644 index 000000000..74568f829 --- /dev/null +++ b/jwql/tests/test_archive_database_update.py @@ -0,0 +1,38 @@ +#! /usr/bin/env python + +"""Tests for the ``archive_database_update`` module. 
+ +Authors +------- + + - Bryan Hilbert + +Use +--- + + These tests can be run via the command line (omit the ``-s`` to + suppress verbose output to stdout): + :: + + pytest -s test_archive_database_update.py +""" + + +import pytest + +from jwql.website.apps.jwql import archive_database_update + + +def test_filter_rootnames(): + """Test the filtering of source-based level 2 files + """ + files = ['jw06434-c1021_s000001510_nircam_f444w-grismr.fits', + 'jw01068004001_02102_00001_nrcb4_rate.fits', + 'jw06434-c1021_t000_nircam_clear-f090w_segm.fits', + 'jw06434-o001_t000_nircam_clear-f090w_segm.fits', + 'jw02183117001_03103_00001-seg001_nrca1_rate.fits'] + + filtered = archive_database_update.filter_rootnames(files) + expected = ['jw01068004001_02102_00001_nrcb4_rate.fits', + 'jw02183117001_03103_00001-seg001_nrca1_rate.fits'] + assert filtered == expected From 79f835a2774082699f851c71b9e66cbde68ba07b Mon Sep 17 00:00:00 2001 From: Bryan Hilbert Date: Wed, 15 Jan 2025 12:02:18 -0500 Subject: [PATCH 5/7] Only define FILESYSTEM if not on github actions --- .../apps/jwql/archive_database_update.py | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/jwql/website/apps/jwql/archive_database_update.py b/jwql/website/apps/jwql/archive_database_update.py index 674991f8b..93d50cee2 100755 --- a/jwql/website/apps/jwql/archive_database_update.py +++ b/jwql/website/apps/jwql/archive_database_update.py @@ -55,27 +55,30 @@ FILE_AC_O_ID_LEN, FILE_AC_CAR_ID_LEN, FILE_SOURCE_ID_LONG_LEN, - FILE_TARG_ID_LEN + FILE_TARG_ID_LEN, + ON_GITHUB_ACTIONS, + ON_READTHEDOCS ) -# These lines are needed in order to use the Django models in a standalone -# script (as opposed to code run as a result of a webpage request). If these -# lines are not run, the script will crash when attempting to import the -# Django models in the line below. 
-os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jwql.website.jwql_proj.settings") -django.setup() - -from jwql.website.apps.jwql.models import Archive, Observation, Proposal, RootFileInfo # noqa -from jwql.utils.constants import JWST_INSTRUMENT_NAMES_MIXEDCASE # noqa -from jwql.utils.logging_functions import log_info, log_fail # noqa -from jwql.utils.monitor_utils import initialize_instrument_monitor # noqa -from jwql.utils.constants import MAST_QUERY_LIMIT # noqa -from jwql.utils.utils import filename_parser, filesystem_path, get_config # noqa -from jwql.website.apps.jwql.data_containers import create_archived_proposals_context # noqa -from jwql.website.apps.jwql.data_containers import get_instrument_proposals, get_filenames_by_instrument # noqa -from jwql.website.apps.jwql.data_containers import get_proposal_info, mast_query_filenames_by_instrument, mast_query_by_rootname # noqa - -FILESYSTEM = get_config()['filesystem'] +if not ON_GITHUB_ACTIONS and not ON_READTHEDOCS: + # These lines are needed in order to use the Django models in a standalone + # script (as opposed to code run as a result of a webpage request). If these + # lines are not run, the script will crash when attempting to import the + # Django models in the line below. 
+ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jwql.website.jwql_proj.settings") + django.setup() + + from jwql.website.apps.jwql.models import Archive, Observation, Proposal, RootFileInfo # noqa + from jwql.utils.constants import JWST_INSTRUMENT_NAMES_MIXEDCASE # noqa + from jwql.utils.logging_functions import log_info, log_fail # noqa + from jwql.utils.monitor_utils import initialize_instrument_monitor # noqa + from jwql.utils.constants import MAST_QUERY_LIMIT # noqa + from jwql.utils.utils import filename_parser, filesystem_path, get_config # noqa + from jwql.website.apps.jwql.data_containers import create_archived_proposals_context # noqa + from jwql.website.apps.jwql.data_containers import get_instrument_proposals, get_filenames_by_instrument # noqa + from jwql.website.apps.jwql.data_containers import get_proposal_info, mast_query_filenames_by_instrument, mast_query_by_rootname # noqa + + FILESYSTEM = get_config()['filesystem'] @log_info From 365506d0f5179d7197150d568ac1f0801c276d48 Mon Sep 17 00:00:00 2001 From: Bryan Hilbert Date: Wed, 15 Jan 2025 12:11:14 -0500 Subject: [PATCH 6/7] Fix import order and location --- .../apps/jwql/archive_database_update.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/jwql/website/apps/jwql/archive_database_update.py b/jwql/website/apps/jwql/archive_database_update.py index 93d50cee2..b9b2db181 100755 --- a/jwql/website/apps/jwql/archive_database_update.py +++ b/jwql/website/apps/jwql/archive_database_update.py @@ -56,9 +56,18 @@ FILE_AC_CAR_ID_LEN, FILE_SOURCE_ID_LONG_LEN, FILE_TARG_ID_LEN, + JWST_INSTRUMENT_NAMES_MIXEDCASE, + MAST_QUERY_LIMIT, ON_GITHUB_ACTIONS, ON_READTHEDOCS ) +from jwql.utils.logging_functions import log_info, log_fail +from jwql.utils.monitor_utils import initialize_instrument_monitor +from jwql.utils.utils import filename_parser, filesystem_path, get_config +from jwql.website.apps.jwql.data_containers import create_archived_proposals_context # noqa +from 
jwql.website.apps.jwql.data_containers import get_instrument_proposals, get_filenames_by_instrument # noqa +from jwql.website.apps.jwql.data_containers import get_proposal_info, mast_query_filenames_by_instrument, mast_query_by_rootname # noqa + if not ON_GITHUB_ACTIONS and not ON_READTHEDOCS: # These lines are needed in order to use the Django models in a standalone @@ -69,15 +78,6 @@ django.setup() from jwql.website.apps.jwql.models import Archive, Observation, Proposal, RootFileInfo # noqa - from jwql.utils.constants import JWST_INSTRUMENT_NAMES_MIXEDCASE # noqa - from jwql.utils.logging_functions import log_info, log_fail # noqa - from jwql.utils.monitor_utils import initialize_instrument_monitor # noqa - from jwql.utils.constants import MAST_QUERY_LIMIT # noqa - from jwql.utils.utils import filename_parser, filesystem_path, get_config # noqa - from jwql.website.apps.jwql.data_containers import create_archived_proposals_context # noqa - from jwql.website.apps.jwql.data_containers import get_instrument_proposals, get_filenames_by_instrument # noqa - from jwql.website.apps.jwql.data_containers import get_proposal_info, mast_query_filenames_by_instrument, mast_query_by_rootname # noqa - FILESYSTEM = get_config()['filesystem'] From bcc92d1eddee2f09239d5c83f5eef7dc171efa11 Mon Sep 17 00:00:00 2001 From: Bryan Hilbert Date: Wed, 15 Jan 2025 12:12:12 -0500 Subject: [PATCH 7/7] make imports easier to read --- jwql/website/apps/jwql/archive_database_update.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/jwql/website/apps/jwql/archive_database_update.py b/jwql/website/apps/jwql/archive_database_update.py index b9b2db181..6a028563f 100755 --- a/jwql/website/apps/jwql/archive_database_update.py +++ b/jwql/website/apps/jwql/archive_database_update.py @@ -64,9 +64,12 @@ from jwql.utils.logging_functions import log_info, log_fail from jwql.utils.monitor_utils import initialize_instrument_monitor from jwql.utils.utils import filename_parser, 
filesystem_path, get_config -from jwql.website.apps.jwql.data_containers import create_archived_proposals_context # noqa -from jwql.website.apps.jwql.data_containers import get_instrument_proposals, get_filenames_by_instrument # noqa -from jwql.website.apps.jwql.data_containers import get_proposal_info, mast_query_filenames_by_instrument, mast_query_by_rootname # noqa +from jwql.website.apps.jwql.data_containers import create_archived_proposals_context +from jwql.website.apps.jwql.data_containers import get_instrument_proposals, get_filenames_by_instrument +from jwql.website.apps.jwql.data_containers import (get_proposal_info, + mast_query_filenames_by_instrument, + mast_query_by_rootname + ) if not ON_GITHUB_ACTIONS and not ON_READTHEDOCS: