Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter rootnames and filenames before calling the filename_parser in archive_database_update #1657

38 changes: 38 additions & 0 deletions jwql/tests/test_archive_database_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#! /usr/bin/env python

"""Tests for the ``archive_database_update`` module.

Authors
-------

- Bryan Hilbert

Use
---

These tests can be run via the command line (omit the ``-s`` to
suppress verbose output to stdout):
::

pytest -s test_archive_database_update.py
"""


import pytest

from jwql.website.apps.jwql import archive_database_update


def test_filter_rootnames():
"""Test the filtering of source-based level 2 files
"""
files = ['jw06434-c1021_s000001510_nircam_f444w-grismr.fits',
'jw01068004001_02102_00001_nrcb4_rate.fits',
'jw06434-c1021_t000_nircam_clear-f090w_segm.fits',
'jw06434-o001_t000_nircam_clear-f090w_segm.fits',
'jw02183117001_03103_00001-seg001_nrca1_rate.fits']

filtered = archive_database_update.filter_rootnames(files)
expected = ['jw01068004001_02102_00001_nrcb4_rate.fits',
'jw02183117001_03103_00001-seg001_nrca1_rate.fits']
assert filtered == expected
1 change: 1 addition & 0 deletions jwql/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,7 @@
FILE_PROG_ID_LEN = 5
FILE_SEG_LEN = 3
FILE_SOURCE_ID_LEN = 5
FILE_SOURCE_ID_LONG_LEN = 9
FILE_TARG_ID_LEN = 3
FILE_VISIT_GRP_LEN = 2
FILE_VISIT_LEN = 3
Expand Down
72 changes: 71 additions & 1 deletion jwql/website/apps/jwql/archive_database_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,20 @@
import logging
import os
import argparse
import re

import numpy as np
import django

from django.apps import apps
from jwql.utils.protect_module import lock_module
from jwql.utils.constants import DEFAULT_MODEL_CHARFIELD
from jwql.utils.constants import (DEFAULT_MODEL_CHARFIELD,
FILE_PROG_ID_LEN,
FILE_AC_O_ID_LEN,
FILE_AC_CAR_ID_LEN,
FILE_SOURCE_ID_LONG_LEN,
FILE_TARG_ID_LEN
)

# These lines are needed in order to use the Django models in a standalone
# script (as opposed to code run as a result of a webpage request). If these
Expand Down Expand Up @@ -113,6 +120,11 @@ def get_updates(update_database):

# Get set of unique rootnames
all_rootnames = set(['_'.join(f.split('/')[-1].split('_')[:-1]) for f in filenames])

# Filter source-based level 2 files out of the rootnames and filenames
all_rootnames = filter_rootnames(all_rootnames)
filenames = filter_filenames(filenames, all_rootnames)

rootnames = []
for rootname in all_rootnames:
filename_dict = filename_parser(rootname)
Expand Down Expand Up @@ -510,6 +522,64 @@ def fill_empty_rootfileinfo(rootfileinfo_set):
logging.info(f'\tSaved {saved_rootfileinfos} Root File Infos')


def filter_filenames(fnames, roots):
"""Filter out filenames from ``fnames`` that don't match the names in ``roots``

Parameters
----------
fnames : list
List of filenames

roots : list
List of rootnames

Returns
-------
filtered_fnames : list
Filtered list of filenames
"""
filtered_fnames = []
for fname in fnames:
for root in roots:
if root in fname:
filtered_fnames.append(fname)
break
return filtered_fnames


def filter_rootnames(rootnames):
"""Filter out rootnames that we know can't be parsed by the filename_parser. We use this
custom filter here rather than within the filename parser itself because in archive_database_update
we can end up providing thousands of unrecognized filenames (e.g. source-based WFSS files) to
the filename parser, which would result in thousands of logging statments and massive log files.
This way, we filter out the rootnames that obviously won't be parsed before calling the
filename_parser with the rest. jw06434-c1021_s000001510_nircam_f444w-grismr
jw06434-c1021_t000_nircam_clear-f090w_segm.fits

Parameters
----------
rootnames : list
List of rootnames

Returns
-------
good_rootnames : list
List of rootnames that do not match the filters
"""
stage_2_source = \
r"jw" \
r"(?P<program_id>\d{" + f"{FILE_PROG_ID_LEN}" + "})"\
r"-(?P<ac_id>(o\d{" + f"{FILE_AC_O_ID_LEN}" + r"}|(c|a|r)\d{" + f"{FILE_AC_CAR_ID_LEN}" + "}))"\
r"_(?P<target_id>(s\d{" + f"{FILE_SOURCE_ID_LONG_LEN}" + r"}|(t)\d{" + f"{FILE_TARG_ID_LEN}" + "}))"\
r"_(?P<instrument>(nircam|niriss|miri))"\
r"_(?P<optical_elements>((?!_)[\w-])+)"\
r"-"

elements = re.compile(stage_2_source)
good_rootnames = [e for e in rootnames if elements.match(e) is None]
return good_rootnames


@lock_module
def protected_code(update_database, fill_empty_list):
"""Protected code ensures only 1 instance of module will run at any given time
Expand Down
Loading