Skip to content

Commit

Permalink
Merge branch 'main' into 9-docs
Browse files Browse the repository at this point in the history
  • Loading branch information
aradhakrishnanGFDL authored Aug 1, 2024
2 parents 448b5be + c949d6b commit 37fd6e9
Show file tree
Hide file tree
Showing 18 changed files with 210 additions and 86 deletions.
15 changes: 2 additions & 13 deletions .github/workflows/conda-env-create-run-pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,6 @@ jobs:
# install catalogbuilder to conda env directories
$CONDA/envs/catalogbuilder/bin/python -m pip install --prefix $CONDA/envs/catalogbuilder .
- name: Run pytest in catalogbuilder conda environment
run: |
which python
python --version
$CONDA/envs/catalogbuilder/bin/python --version
# which pytest
$CONDA/envs/catalogbuilder/bin/pytest catalogbuilder
- name: Make sample data
run: |
which python
Expand All @@ -57,11 +49,8 @@ jobs:
with:
name: workflow-artifacts1
path: |
gfdl_autotest.csv
gfdl_autotest.json
cats/gfdl_autotest_from_yaml.csv
cats/gfdl_autotest_from_yaml.json
catalogbuilder/cats/gfdl_autotest_from_yaml.json
catalogbuilder/cats/gfdl_autotest_from_yaml.csv
- name: Download all workflow run artifacts
uses: actions/download-artifact@v4

Expand Down
35 changes: 35 additions & 0 deletions .github/workflows/conda-pkg-extra-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: conda-pkg-extra-tests
on:
pull_request:
branches:
# for testing conda build w no upload during PRs
- main
jobs:
build:
runs-on: ubuntu-latest
container:
image: continuumio/miniconda3:latest
steps:
- name: Checkout Files
uses: actions/checkout@v4
- name: Run Docker to Build
run: |
conda config --append channels conda-forge
conda config --append channels noaa-gfdl
conda install conda-build conda-verify
conda build .
- name: Run additional utilities as tests
run: |
conda create --name catalogbuildertest
conda install -n catalogbuildertest catalogbuilder --use-local
/opt/conda/envs/catalogbuildertest/bin/pytest catalogbuilder/tests/test_create_catalog.py
#we will save the output from following alone as manifest
- name: upload-artifacts
uses: actions/upload-artifact@v4
with:
name: workflow-artifacts1
path: |
sample-mdtf-catalog.csv
sample-mdtf-catalog.json
- name: Download all workflow run artifacts
uses: actions/download-artifact@v4
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
recursive-include catalogbuilder/cats *

recursive-include catalogbuilder/intakebuilder/dat *
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,4 @@ Cite our work: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5196586.svg)]

See our [project documentation site ](https://noaa-gfdl.github.io/CatalogBuilder/).


This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
2 changes: 1 addition & 1 deletion catalogbuilder/intakebuilder/builderconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
"time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
Expand Down
10 changes: 10 additions & 0 deletions catalogbuilder/intakebuilder/dat/gfdlcmipfreq.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
monthly:
frequency: mon
daily:
frequency: day
hourly:
frequency: 1hr
annual:
  frequency: yearly  # NOTE(review): CMIP6 canonical annual frequency is "yr" — confirm "yearly" is intended
3hr:
frequency: 3hr
68 changes: 43 additions & 25 deletions catalogbuilder/intakebuilder/getinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,19 @@ def getinfoFromYAML(dictInfo,yamlfile,miptable=None):
dictInfo["realm"] = "NA"
return(dictInfo)

def getFreqFromYAML(yamlfile,gfdlfreq=None):
    '''
    Return the CMIP frequency string mapped to a GFDL pp frequency.

    :param yamlfile: path to a YAML file mapping GFDL frequency names
                     (e.g. "monthly") to {"frequency": <cmip freq>} entries
    :param gfdlfreq: GFDL pp frequency name to look up; if None/empty no lookup is done
    :return: the mapped CMIP frequency string, or None when gfdlfreq is unset
             or has no mapping in the YAML file
    '''
    import yaml
    cmipfreq = None
    with open(yamlfile) as f:
        # safe_load: the file is plain key/value data, no need for FullLoader's
        # wider (and riskier) object-construction surface
        mappings = yaml.safe_load(f)
    if(gfdlfreq):
        try:
            cmipfreq = mappings[gfdlfreq]["frequency"]
        except KeyError:
            cmipfreq = None
    return(cmipfreq)

def getStem(dirpath,projectdir):
'''
return stem from the project directory passed and the files crawled within
Expand Down Expand Up @@ -81,31 +94,34 @@ def getInfoFromFilename(filename,dictInfo,logger):
return dictInfo

#adding this back to trace back some old errors
#adding this back to trace back some old errors
def getInfoFromGFDLFilename(filename,dictInfo,logger,configyaml):
    '''
    Parse a GFDL pp netCDF filename (e.g. atmos.200501-200912.t_ref.nc) and
    fill dictInfo with the fields named in output_file_template.

    The template names catalog headers right-to-left against the dot-separated
    filename tokens, starting just before the trailing "nc" extension token;
    "NA" entries in the template are skipped.

    :param filename: basename of the file being cataloged
    :param dictInfo: dict of catalog column values, updated in place
    :param logger: logger used for a debug message when configyaml is not supplied
    :param configyaml: parsed config object providing output_file_template;
                       falls back to builderconfig when None
    :return: dictInfo (also mutated in place)
    '''
    if filename.endswith(".nc"):
        stemdir = filename.split(".")
        if configyaml:
            output_file_template = configyaml.output_file_template
        else:
            logger.debug("Filename not compatible with this version of the builder:"+filename)
            try:
                output_file_template = builderconfig.output_file_template
            except (NameError, AttributeError):
                # builderconfig unavailable or missing the template; cannot proceed
                sys.exit("No output_path_template found. Check configuration.")
        # walk the template right-to-left; j indexes filename tokens from the
        # end, starting at -2 to skip the trailing "nc" extension token
        j = -2
        for i in range(len(output_file_template)-1, -1, -1):
            if output_file_template[i] != "NA":
                try:
                    dictInfo[output_file_template[i]] = stemdir[j]
                except IndexError:
                    # template has more fields than the filename has tokens;
                    # leave the extra columns blank rather than failing
                    dictInfo[output_file_template[i]] = ""
            j = j - 1
    return dictInfo

def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo,configyaml):
Expand Down Expand Up @@ -241,12 +257,14 @@ def getStandardName(list_variable_id):
#search for variable and its cf name
for variable_id in list_variable_id:
cfname = (df[df['GFDL_varname'] == variable_id]["standard_name"])
#print(cfname,variable_id)
list_cfname = cfname.tolist()
if not list_cfname:
if(len(list_cfname) == 0):
#print("what if the names correspond to CMOR_varname")
cfname = (df[df['CMOR_varname'] == variable_id]["standard_name"])
list_cfname = cfname.tolist()
#print(list_cfname)
if len(list_cfname) > 0:
unique_cf = list(set(list_cfname))[0]
dictCF[variable_id] = unique_cf
dictCF[variable_id] = unique_cf
return (dictCF)
13 changes: 11 additions & 2 deletions catalogbuilder/intakebuilder/gfdlcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
if (op.countOf(filename,".") == 1):
dictInfo = getinfo.getInfoFromFilename(filename,dictInfo, logger)
else:
dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger)
dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger,configyaml)
dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo,configyaml)
list_bad_modellabel = ["","piControl","land-hist","piClim-SO2","abrupt-4xCO2","hist-piAer","hist-piNTCF","piClim-ghg","piClim-OC","hist-GHG","piClim-BC","1pctCO2"]
list_bad_chunklabel = ['DO_NOT_USE']
Expand Down Expand Up @@ -106,6 +106,15 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
if "standard_name" in missingcols:
dictInfo["standard_name"] = "na"
getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo)

#replace frequency as needed
if 'frequency' in dictInfo.keys():
package_dir = os.path.dirname(os.path.abspath(__file__))
yamlfile = os.path.join(package_dir, 'dat/gfdlcmipfreq.yaml')
cmipfreq = None
gfdlfreq = dictInfo['frequency']
cmipfreq = getinfo.getFreqFromYAML(yamlfile,gfdlfreq=dictInfo['frequency'])
if(cmipfreq is not None):
dictInfo['frequency'] = cmipfreq
#print("Adjusting frequency from ", gfdlfreq ," to ",cmipfreq)
listfiles.append(dictInfo)
return listfiles
43 changes: 41 additions & 2 deletions catalogbuilder/scripts/configs/config-example.yml
Original file line number Diff line number Diff line change
@@ -1,2 +1,41 @@
input_path: "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" #"ENTER INPUT PATH HERE" #Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.

#catalog headers
#The headerlist is expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.

output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

output_file_template: ['realm','time_range','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

input_path: "archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp"
output_path: "sample-mdtf-catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
39 changes: 20 additions & 19 deletions catalogbuilder/scripts/gen_intake_gfdl.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,23 +29,8 @@
package_dir = os.path.dirname(os.path.abspath(__file__))
template_path = os.path.join(package_dir, '../cats/gfdl_template.json')

#Setting up argument parsing/flags
@click.command()
#TODO arguments dont have help message. So consider changing arguments to options?
@click.argument('input_path',required=False,nargs=1)
#,help='The directory path with the datasets to be cataloged. E.g a GFDL PP path till /pp')
@click.argument('output_path',required=False,nargs=1)
#,help='Specify output filename suffix only. e.g. catalog')
@click.option('--config',required=False,type=click.Path(exists=True),nargs=1,help='Path to your yaml config, Use the config_template in intakebuilder repo')
@click.option('--filter_realm', nargs=1)
@click.option('--filter_freq', nargs=1)
@click.option('--filter_chunk', nargs=1)
@click.option('--overwrite', is_flag=True, default=False)
@click.option('--append', is_flag=True, default=False)
@click.option('--slow','-s', is_flag=True, default=False)
def main(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
def create_catalog(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
overwrite=False, append=False, slow = False):

configyaml = None
# TODO error catching
#print("input path: ",input_path, " output path: ", output_path)
Expand Down Expand Up @@ -86,7 +71,6 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
dictFilter["chunk_freq"] = "5yr"
dictFilterIgnore["remove"]= 'DO_NOT_USE'
'''
#########################################################
dictInfo = {}
project_dir = project_dir.rstrip("/")
logger.info("Calling gfdlcrawler.crawlLocal")
Expand Down Expand Up @@ -117,7 +101,6 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
#if(df['variable_id'].eq(k)).any():
df['standard_name'].loc[(df['variable_id'] == k)] = v
#df['standard_name'] = v

if(slow == False) & ('standard_name' in headers):
if ((df is not None) & (len(df) != 0) ):
with open(csv_path, 'w') as csvfile:
Expand All @@ -126,7 +109,25 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
print("JSON generated at:", os.path.abspath(json_path))
print("CSV generated at:", os.path.abspath(csv_path))
logger.info("CSV generated at" + os.path.abspath(csv_path))
return(csv_path,json_path)

#Setting up argument parsing/flags
@click.command()
#TODO arguments dont have help message. So consider changing arguments to options?
@click.argument('input_path',required=False,nargs=1)
#,help='The directory path with the datasets to be cataloged. E.g a GFDL PP path till /pp')
@click.argument('output_path',required=False,nargs=1)
#,help='Specify output filename suffix only. e.g. catalog')
@click.option('--config',required=False,type=click.Path(exists=True),nargs=1,help='Path to your yaml config, Use the config_template in intakebuilder repo')
@click.option('--filter_realm', nargs=1)
@click.option('--filter_freq', nargs=1)
@click.option('--filter_chunk', nargs=1)
@click.option('--overwrite', is_flag=True, default=False)
@click.option('--append', is_flag=True, default=False)
@click.option('--slow','-s', is_flag=True, default=False)

def create_catalog_cli(**kwargs):
    """Click entry point: forward the parsed CLI arguments/options to create_catalog."""
    return create_catalog(**kwargs)

if __name__ == '__main__':
main()
create_catalog_cli()
12 changes: 6 additions & 6 deletions catalogbuilder/scripts/gen_intake_gfdl_runner.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#!/usr/bin/env python

#TODO test after conda pkg is published and make changes as needed
#from catalogbuilder.scripts import gen_intake_gfdl
from . import gen_intake_gfdl
from catalogbuilder.scripts import gen_intake_gfdl
import sys

input_path = "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/"
input_path = "archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp"
output_path = "test"
sys.argv = ['INPUT_PATH', input_path, output_path]
print(sys.argv)
gen_intake_gfdl.main()
try:
gen_intake_gfdl.create_catalog(input_path,output_path)
except:
sys.exit("Exception occured calling gen_intake_gfdl.create_catalog")

18 changes: 10 additions & 8 deletions catalogbuilder/scripts/gen_intake_gfdl_runner_config.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
#!/usr/bin/env python

#from catalogbuilder.scripts import gen_intake_gfdl
from . import gen_intake_gfdl
import sys

# this will break at some point #TODO
sys.argv = ['input_path','--config', '/home/a1r/github/CatalogBuilder/scripts/configs/config-example.yml']
print(sys.argv)
gen_intake_gfdl.main()
from catalogbuilder.scripts import gen_intake_gfdl
import sys, os

#This is an example call to run catalog builder using a yaml config file.
package_dir = os.path.dirname(os.path.abspath(__file__))
configyaml = os.path.join(package_dir, 'configs/config-example.yml')

def create_catalog_from_config(config=configyaml):
    '''
    Build a catalog using a YAML builder config file.

    :param config: path to the builder YAML config; defaults to the packaged
                   configs/config-example.yml
    :return: (csv_path, json_path) tuple returned by create_catalog
    '''
    # Pass the caller-supplied config through (the previous version ignored
    # this parameter and always used the module-level default). Also avoid
    # shadowing the stdlib csv/json module names with the locals.
    csv_path, json_path = gen_intake_gfdl.create_catalog(config=config)
    return (csv_path, json_path)

2 changes: 1 addition & 1 deletion catalogbuilder/tests/config-cfname.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ output_file_template: ['realm','time_range','variable_id']
#######################################################

input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
Loading

0 comments on commit 37fd6e9

Please sign in to comment.