Skip to content

Commit

Permalink
Merge branch 'main' into 9-docs
Browse files Browse the repository at this point in the history
  • Loading branch information
aradhakrishnanGFDL authored Aug 1, 2024
2 parents 448b5be + c949d6b commit 37fd6e9
Show file tree
Hide file tree
Showing 18 changed files with 210 additions and 86 deletions.
15 changes: 2 additions & 13 deletions .github/workflows/conda-env-create-run-pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,6 @@ jobs:
# install catalogbuilder to conda env directories
$CONDA/envs/catalogbuilder/bin/python -m pip install --prefix $CONDA/envs/catalogbuilder .
- name: Run pytest in catalogbuilder conda environment
run: |
which python
python --version
$CONDA/envs/catalogbuilder/bin/python --version
# which pytest
$CONDA/envs/catalogbuilder/bin/pytest catalogbuilder
- name: Make sample data
run: |
which python
Expand All @@ -57,11 +49,8 @@ jobs:
with:
name: workflow-artifacts1
path: |
gfdl_autotest.csv
gfdl_autotest.json
cats/gfdl_autotest_from_yaml.csv
cats/gfdl_autotest_from_yaml.json
catalogbuilder/cats/gfdl_autotest_from_yaml.json
catalogbuilder/cats/gfdl_autotest_from_yaml.csv
- name: Download all workflow run artifacts
uses: actions/download-artifact@v4

Expand Down
35 changes: 35 additions & 0 deletions .github/workflows/conda-pkg-extra-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: conda-pkg-extra-tests
on:
pull_request:
branches:
# for testing conda build w no upload during PRs
- main
jobs:
build:
runs-on: ubuntu-latest
container:
image: continuumio/miniconda3:latest
steps:
- name: Checkout Files
uses: actions/checkout@v4
- name: Run Docker to Build
run: |
conda config --append channels conda-forge
conda config --append channels noaa-gfdl
conda install conda-build conda-verify
conda build .
- name: Run additional utilities as tests
run: |
conda create --name catalogbuildertest
conda install -n catalogbuildertest catalogbuilder --use-local
/opt/conda/envs/catalogbuildertest/bin/pytest catalogbuilder/tests/test_create_catalog.py
#we will save the output from following alone as manifest
- name: upload-artifacts
uses: actions/upload-artifact@v4
with:
name: workflow-artifacts1
path: |
sample-mdtf-catalog.csv
sample-mdtf-catalog.json
- name: Download all workflow run artifacts
uses: actions/download-artifact@v4
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
recursive-include catalogbuilder/cats *

recursive-include catalogbuilder/intakebuilder/dat *
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,4 @@ Cite our work: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5196586.svg)]

See our [project documentation site ](https://noaa-gfdl.github.io/CatalogBuilder/).


This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
2 changes: 1 addition & 1 deletion catalogbuilder/intakebuilder/builderconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
"time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
Expand Down
10 changes: 10 additions & 0 deletions catalogbuilder/intakebuilder/dat/gfdlcmipfreq.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
monthly:
frequency: mon
daily:
frequency: day
hourly:
frequency: 1hr
annual:
  frequency: yearly  # NOTE(review): CMIP6 canonical annual frequency is "yr" — confirm "yearly" is intended
3hr:
frequency: 3hr
68 changes: 43 additions & 25 deletions catalogbuilder/intakebuilder/getinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,19 @@ def getinfoFromYAML(dictInfo,yamlfile,miptable=None):
dictInfo["realm"] = "NA"
return(dictInfo)

def getFreqFromYAML(yamlfile,gfdlfreq=None):
    '''
    Return the CMIP frequency string mapped to a GFDL pp frequency.

    :param yamlfile: path to a YAML file mapping GFDL frequency names
                     (e.g. "monthly") to {"frequency": <cmip freq>} entries
    :param gfdlfreq: GFDL pp frequency name to look up; if None/empty no lookup is done
    :return: the mapped CMIP frequency string, or None when gfdlfreq is unset
             or has no mapping in the YAML file
    '''
    import yaml
    cmipfreq = None
    with open(yamlfile) as f:
        # safe_load: the file is plain key/value data, no need for FullLoader's
        # wider (and riskier) object-construction surface
        mappings = yaml.safe_load(f)
    if(gfdlfreq):
        try:
            cmipfreq = mappings[gfdlfreq]["frequency"]
        except KeyError:
            cmipfreq = None
    return(cmipfreq)

def getStem(dirpath,projectdir):
'''
return stem from the project directory passed and the files crawled within
Expand Down Expand Up @@ -81,31 +94,34 @@ def getInfoFromFilename(filename,dictInfo,logger):
return dictInfo

#adding this back to trace back some old errors
#adding this back to trace back some old errors
def getInfoFromGFDLFilename(filename,dictInfo,logger,configyaml):
    '''
    Parse a GFDL pp netCDF filename (e.g. atmos.200501-200912.t_ref.nc) and
    fill dictInfo with the fields named in output_file_template.

    The template names catalog headers right-to-left against the dot-separated
    filename tokens, starting just before the trailing "nc" extension token;
    "NA" entries in the template are skipped.

    :param filename: basename of the file being cataloged
    :param dictInfo: dict of catalog column values, updated in place
    :param logger: logger used for a debug message when configyaml is not supplied
    :param configyaml: parsed config object providing output_file_template;
                       falls back to builderconfig when None
    :return: dictInfo (also mutated in place)
    '''
    if filename.endswith(".nc"):
        stemdir = filename.split(".")
        if configyaml:
            output_file_template = configyaml.output_file_template
        else:
            logger.debug("Filename not compatible with this version of the builder:"+filename)
            try:
                output_file_template = builderconfig.output_file_template
            except (NameError, AttributeError):
                # builderconfig unavailable or missing the template; cannot proceed
                sys.exit("No output_path_template found. Check configuration.")
        # walk the template right-to-left; j indexes filename tokens from the
        # end, starting at -2 to skip the trailing "nc" extension token
        j = -2
        for i in range(len(output_file_template)-1, -1, -1):
            if output_file_template[i] != "NA":
                try:
                    dictInfo[output_file_template[i]] = stemdir[j]
                except IndexError:
                    # template has more fields than the filename has tokens;
                    # leave the extra columns blank rather than failing
                    dictInfo[output_file_template[i]] = ""
            j = j - 1
    return dictInfo

def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo,configyaml):
Expand Down Expand Up @@ -241,12 +257,14 @@ def getStandardName(list_variable_id):
#search for variable and its cf name
for variable_id in list_variable_id:
cfname = (df[df['GFDL_varname'] == variable_id]["standard_name"])
#print(cfname,variable_id)
list_cfname = cfname.tolist()
if not list_cfname:
if(len(list_cfname) == 0):
#print("what if the names correspond to CMOR_varname")
cfname = (df[df['CMOR_varname'] == variable_id]["standard_name"])
list_cfname = cfname.tolist()
#print(list_cfname)
if len(list_cfname) > 0:
unique_cf = list(set(list_cfname))[0]
dictCF[variable_id] = unique_cf
dictCF[variable_id] = unique_cf
return (dictCF)
13 changes: 11 additions & 2 deletions catalogbuilder/intakebuilder/gfdlcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
if (op.countOf(filename,".") == 1):
dictInfo = getinfo.getInfoFromFilename(filename,dictInfo, logger)
else:
dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger)
dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger,configyaml)
dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo,configyaml)
list_bad_modellabel = ["","piControl","land-hist","piClim-SO2","abrupt-4xCO2","hist-piAer","hist-piNTCF","piClim-ghg","piClim-OC","hist-GHG","piClim-BC","1pctCO2"]
list_bad_chunklabel = ['DO_NOT_USE']
Expand Down Expand Up @@ -106,6 +106,15 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
if "standard_name" in missingcols:
dictInfo["standard_name"] = "na"
getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo)

#replace frequency as needed
if 'frequency' in dictInfo.keys():
package_dir = os.path.dirname(os.path.abspath(__file__))
yamlfile = os.path.join(package_dir, 'dat/gfdlcmipfreq.yaml')
cmipfreq = None
gfdlfreq = dictInfo['frequency']
cmipfreq = getinfo.getFreqFromYAML(yamlfile,gfdlfreq=dictInfo['frequency'])
if(cmipfreq is not None):
dictInfo['frequency'] = cmipfreq
#print("Adjusting frequency from ", gfdlfreq ," to ",cmipfreq)
listfiles.append(dictInfo)
return listfiles
43 changes: 41 additions & 2 deletions catalogbuilder/scripts/configs/config-example.yml
Original file line number Diff line number Diff line change
@@ -1,2 +1,41 @@
input_path: "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" #"ENTER INPUT PATH HERE" #Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.

#catalog headers
#The headerlist is expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.

output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

output_file_template: ['realm','time_range','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

input_path: "archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp"
output_path: "sample-mdtf-catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
39 changes: 20 additions & 19 deletions catalogbuilder/scripts/gen_intake_gfdl.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,23 +29,8 @@
package_dir = os.path.dirname(os.path.abspath(__file__))
template_path = os.path.join(package_dir, '../cats/gfdl_template.json')

#Setting up argument parsing/flags
@click.command()
#TODO arguments dont have help message. So consider changing arguments to options?
@click.argument('input_path',required=False,nargs=1)
#,help='The directory path with the datasets to be cataloged. E.g a GFDL PP path till /pp')
@click.argument('output_path',required=False,nargs=1)
#,help='Specify output filename suffix only. e.g. catalog')
@click.option('--config',required=False,type=click.Path(exists=True),nargs=1,help='Path to your yaml config, Use the config_template in intakebuilder repo')
@click.option('--filter_realm', nargs=1)
@click.option('--filter_freq', nargs=1)
@click.option('--filter_chunk', nargs=1)
@click.option('--overwrite', is_flag=True, default=False)
@click.option('--append', is_flag=True, default=False)
@click.option('--slow','-s', is_flag=True, default=False)
def main(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
def create_catalog(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
overwrite=False, append=False, slow = False):

configyaml = None
# TODO error catching
#print("input path: ",input_path, " output path: ", output_path)
Expand Down Expand Up @@ -86,7 +71,6 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
dictFilter["chunk_freq"] = "5yr"
dictFilterIgnore["remove"]= 'DO_NOT_USE'
'''
#########################################################
dictInfo = {}
project_dir = project_dir.rstrip("/")
logger.info("Calling gfdlcrawler.crawlLocal")
Expand Down Expand Up @@ -117,7 +101,6 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
#if(df['variable_id'].eq(k)).any():
df['standard_name'].loc[(df['variable_id'] == k)] = v
#df['standard_name'] = v

if(slow == False) & ('standard_name' in headers):
if ((df is not None) & (len(df) != 0) ):
with open(csv_path, 'w') as csvfile:
Expand All @@ -126,7 +109,25 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
print("JSON generated at:", os.path.abspath(json_path))
print("CSV generated at:", os.path.abspath(csv_path))
logger.info("CSV generated at" + os.path.abspath(csv_path))
return(csv_path,json_path)

#Setting up argument parsing/flags
@click.command()
#TODO arguments dont have help message. So consider changing arguments to options?
@click.argument('input_path',required=False,nargs=1)
#,help='The directory path with the datasets to be cataloged. E.g a GFDL PP path till /pp')
@click.argument('output_path',required=False,nargs=1)
#,help='Specify output filename suffix only. e.g. catalog')
@click.option('--config',required=False,type=click.Path(exists=True),nargs=1,help='Path to your yaml config, Use the config_template in intakebuilder repo')
@click.option('--filter_realm', nargs=1)
@click.option('--filter_freq', nargs=1)
@click.option('--filter_chunk', nargs=1)
@click.option('--overwrite', is_flag=True, default=False)
@click.option('--append', is_flag=True, default=False)
@click.option('--slow','-s', is_flag=True, default=False)

def create_catalog_cli(**kwargs):
    """Click entry point: forward the parsed CLI arguments/options to create_catalog."""
    return create_catalog(**kwargs)

if __name__ == '__main__':
main()
create_catalog_cli()
12 changes: 6 additions & 6 deletions catalogbuilder/scripts/gen_intake_gfdl_runner.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#!/usr/bin/env python

#TODO test after conda pkg is published and make changes as needed
#from catalogbuilder.scripts import gen_intake_gfdl
from . import gen_intake_gfdl
from catalogbuilder.scripts import gen_intake_gfdl
import sys

input_path = "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/"
input_path = "archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp"
output_path = "test"
sys.argv = ['INPUT_PATH', input_path, output_path]
print(sys.argv)
gen_intake_gfdl.main()
try:
gen_intake_gfdl.create_catalog(input_path,output_path)
except:
sys.exit("Exception occured calling gen_intake_gfdl.create_catalog")

18 changes: 10 additions & 8 deletions catalogbuilder/scripts/gen_intake_gfdl_runner_config.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
#!/usr/bin/env python

#from catalogbuilder.scripts import gen_intake_gfdl
from . import gen_intake_gfdl
import sys

# this will break at some point #TODO
sys.argv = ['input_path','--config', '/home/a1r/github/CatalogBuilder/scripts/configs/config-example.yml']
print(sys.argv)
gen_intake_gfdl.main()
from catalogbuilder.scripts import gen_intake_gfdl
import sys, os

#This is an example call to run catalog builder using a yaml config file.
package_dir = os.path.dirname(os.path.abspath(__file__))
configyaml = os.path.join(package_dir, 'configs/config-example.yml')

def create_catalog_from_config(config=configyaml):
    '''
    Build a catalog using a YAML builder config file.

    :param config: path to the builder YAML config; defaults to the packaged
                   configs/config-example.yml
    :return: (csv_path, json_path) tuple returned by create_catalog
    '''
    # Pass the caller-supplied config through (the previous version ignored
    # this parameter and always used the module-level default). Also avoid
    # shadowing the stdlib csv/json module names with the locals.
    csv_path, json_path = gen_intake_gfdl.create_catalog(config=config)
    return (csv_path, json_path)

2 changes: 1 addition & 1 deletion catalogbuilder/tests/config-cfname.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ output_file_template: ['realm','time_range','variable_id']
#######################################################

input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
Loading

0 comments on commit 37fd6e9

Please sign in to comment.