Skip to content

Commit

Permalink
Merge pull request #134 from aradhakrishnanGFDL/main
Browse files Browse the repository at this point in the history
Merging main
  • Loading branch information
Ciheim authored Jun 6, 2024
2 parents 794f383 + de35635 commit 3ea963c
Show file tree
Hide file tree
Showing 9 changed files with 94 additions and 63 deletions.
4 changes: 2 additions & 2 deletions cats/gfdl_template.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
},
{
"column_name": "frequency",
"vocabulary": ""
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/CMIP6_frequency.json"
},
{
"column_name": "modeling_realm",
Expand Down Expand Up @@ -47,7 +47,7 @@
},
{
"column_name": "chunk_freq",
"vocabulary": ""
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/GFDL_chunk_freq.json"
},
{
"column_name": "grid_label",
Expand Down
36 changes: 12 additions & 24 deletions configs/config-template.yaml
Original file line number Diff line number Diff line change
@@ -1,41 +1,29 @@
#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.

#catalog headers
#The headerlist is expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#The headerlist contains expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "modeling_realm", "table_id",
"member_id", "grid_label", "variable_id",
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
headerlist: ["activity_id", "institution_id", "source_id", "experiment_id", "frequency", "modeling_realm", "table_id","member_id",
"grid_label", "variable_id", "temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]


#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.


output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']

output_file_template: ['modeling_realm','temporal_subset','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################
#ENTER INPUT PATH HERE (ex: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/)

input_path: ""

#ENTER NAME OF THE CSV AND JSON, WITHOUT THE SUFFIX. (ex: catalog) The builder then generates catalog.csv and catalog.json. This can also be an absolute path)

output_path: ""

input_path: "/Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" #"ENTER INPUT PATH HERE" #Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
1 change: 1 addition & 0 deletions doc/generation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ Catalogs are generated by the following command: *fre catalog buildcatalog <INPU
See `Flags`_ here.

See `Fre-CLI Documentation here <https://ciheim.github.io/fre-cli/>`_

Optional Configuration
----------------------
Expand Down
16 changes: 8 additions & 8 deletions intakebuilder/CSVwriter.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
import os.path
import csv
from csv import writer
from intakebuilder import builderconfig
def getHeader():
from intakebuilder import builderconfig, configparser

def getHeader(configyaml):
'''
    returns header that is the first line in the csv file, refers to builderconfig.py
:return: headerlist with all columns
'''
#TODO move headerlist outside in a separate configuration or
#headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
# "frequency", "modeling_realm", "table_id",
# "member_id", "grid_label", "variable_id",
# "temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
return builderconfig.headerlist
if configyaml:
return configyaml.headerlist
else:
return builderconfig.headerlist

def writeHeader(csvfile):
'''
writing header for the csv
Expand Down
4 changes: 3 additions & 1 deletion intakebuilder/builderconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.

output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']

#output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']

output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']
output_file_template = ['modeling_realm','temporal_subset','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
Expand Down
16 changes: 16 additions & 0 deletions intakebuilder/configparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,19 @@ def __init__(self, config):
print(self.output_path)
except:
raise KeyError("output_path does not exist in config")
try:
self.headerlist = configfile['headerlist']
print(self.headerlist)
except:
raise KeyError("headerlist does not exist in config")
try:
self.output_path_template = configfile['output_path_template']
print(self.output_path_template)
except:
raise KeyError("output_path_template does not exist in config")
try:
self.output_file_template = configfile['output_file_template']
print(self.output_file_template)
except:
raise KeyError("output_file_template does not exist in config")

41 changes: 24 additions & 17 deletions intakebuilder/getinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@
from csv import writer
import os
import xarray as xr
import shutil as sh
from intakebuilder import builderconfig
from intakebuilder import builderconfig, configparser

warning_count = 0;

'''
getinfo.py provides helper functions to get information (from filename, DRS, file/global attributes) needed to populate the catalog
Expand Down Expand Up @@ -84,7 +82,7 @@ def getInfoFromFilename(filename,dictInfo,logger):
#adding this back to trace back some old errors
def getInfoFromGFDLFilename(filename,dictInfo,logger):
# 5 AR: get the following from the netCDF filename e.g. atmos.200501-200912.t_ref.nc
if(filename.endswith(".nc")):
if(filename.endswith(".nc")): #and not filename.startswith(".")):
ncfilename = filename.split(".")
varname = ncfilename[-2]
dictInfo["variable_id"] = varname
Expand All @@ -107,7 +105,7 @@ def getInfoFromGFDLFilename(filename,dictInfo,logger):
logger.debug("Filename not compatible with this version of the builder:"+filename)
return dictInfo

def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo):
def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo,configyaml):
'''
Returns info from project directory and the DRS path to the file
:param dirpath:
Expand All @@ -119,31 +117,40 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo):
# "ensemble_member", "grid_label", "variable",
# "temporal subset", "version", "path"]

#Grab values based on their expected position in path
#Grab values based on their expected position in path
stemdir = dirpath.split("/")
# adding back older versions to ensure we get info from builderconfig
stemdir = dirpath.split("/")
nlen = len(builderconfig.output_path_template)

#lets go backwards and match given input directory to the template, add things to dictInfo
j = -1
cnt = 1
global warning_count
if configyaml:
output_path_template = configyaml.output_path_template
else:
try:
output_path_template = builderconfig.output_path_template
except:
sys.exit("No output_path_template found in builderconfig.py. Check configuration.")

nlen = len(output_path_template)
for i in range(nlen-1,0,-1):
try:
if(builderconfig.output_path_template[i] != "NA"):
dictInfo[builderconfig.output_path_template[i]] = stemdir[(j)]
except:
sys.exit("oops in getInfoFromGFDLDRS"+str(i)+str(j)+builderconfig.output_path_template[i]+stemdir[j])
if(output_path_template[i] != "NA"):
try:
dictInfo[output_path_template[i]] = stemdir[(j)]
except IndexError:
print("Check configuration. Is output path template set correctly?")
exit()
except IndexError:
sys.exit("oops in getInfoFromGFDLDRS"+str(i)+str(j)+output_path_template[i]+stemdir[j])
j = j - 1
cnt = cnt + 1
    # We do not want to work with anything that's not time series
#TODO have verbose option to print message

if (dictInfo["cell_methods"] != "ts" and warning_count < 1):
print("Skipping non-timeseries data")
warning_count = 1

if (dictInfo["cell_methods"] != "ts"):
#print("Skipping non-timeseries data")
return {}
return dictInfo
'''
Expand Down
25 changes: 18 additions & 7 deletions intakebuilder/gfdlcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
It finally returns a list of dict. eg {'project': 'CMIP6', 'path': '/uda/CMIP6/CDRMIP/NCC/NorESM2-LM/esm-pi-cdr-pulse/r1i1p1f1/Emon/zg/gn/v20191108/zg_Emon_NorESM2-LM_esm-pi-cdr-pulse_r1i1p1f1_gn_192001-192912.nc', 'variable': 'zg', 'mip_table': 'Emon', 'model': 'NorESM2-LM', 'experiment_id': 'esm-pi-cdr-pulse', 'ensemble_member': 'r1i1p1f1', 'grid_label': 'gn', 'temporal subset': '192001-192912', 'institute': 'NCC', 'version': 'v20191108'}
'''
def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger):
def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml):
'''
    Crawl through the local directory and run through the getInfo.. functions
:param projectdir:
Expand All @@ -19,6 +19,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger):
pat = re.compile('({}/{}/{}/{})'.format(dictFilter["modeling_realm"],"ts",dictFilter["frequency"],dictFilter["chunk_freq"]))

orig_pat = pat

#TODO INCLUDE filter in traversing through directories at the top
for dirpath, dirs, files in os.walk(projectdir):
searchpath = dirpath
Expand All @@ -27,17 +28,23 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger):
if(pat is not None):
m = re.search(pat, searchpath)
for filename in files:
# get info from filename
filepath = os.path.join(dirpath,filename) # 1 AR: Bugfix: this needs to join dirpath and filename to get the full path to the file

#if filename.startswith("."):
# logger.debug("Skipping hidden file", filepath)
# continue
if not filename.endswith(".nc"):
logger.debug("FILE does not end with .nc. Skipping", filepath)
continue
logger.info(dirpath+"/"+filename)
dictInfo = {}
dictInfo = getinfo.getProject(projectdir, dictInfo)
# get info from filename
filepath = os.path.join(dirpath,filename) # 1 AR: Bugfix: this needs to join dirpath and filename to get the full path to the file
if not filename.endswith(".nc"):
logger.debug("FILE does not end with .nc. Skipping", filepath)
continue
#filepath = os.path.join(dirpath,filename) # 1 AR: Bugfix: this needs to join dirpath and filename to get the full path to the file
dictInfo["path"]=filepath
dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger)
dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo)
dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo,configyaml)
#sys.exit()
list_bad_modellabel = ["","piControl","land-hist","piClim-SO2","abrupt-4xCO2","hist-piAer","hist-piNTCF","piClim-ghg","piClim-OC","hist-GHG","piClim-BC","1pctCO2"]
list_bad_chunklabel = ['DO_NOT_USE']
Expand All @@ -50,11 +57,15 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger):
logger.debug("Found bad chunk, skipping this possibly bad DRS filename",filepath)
continue

if configyaml:
headerlist = configyaml.headerlist
else:
headerlist = builderconfig.headerlist
# remove those keys that are not CSV headers
# move it so its one time
rmkeys = []
for dkeys in dictInfo.keys():
if dkeys not in builderconfig.headerlist:
if dkeys not in headerlist:
rmkeys.append(dkeys)
rmkeys = list(set(rmkeys))

Expand Down
14 changes: 10 additions & 4 deletions scripts/gen_intake_gfdl.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,16 @@
@click.option('--append', is_flag=True, default=False)
def main(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
overwrite=False, append=False):

configyaml = None
# TODO error catching
print("input-path",input_path, config)
if (input_path is None):
#print("input path: ",input_path, " output path: ", output_path)
if input_path is None or output_path is None:
print("No paths given, using yaml configuration")
configyaml = configparser.Config(config)
if configyaml.input_path is None or not configyaml.input_path :
sys.exit("Can't find paths, is yaml configured?")

input_path = configyaml.input_path
output_path = configyaml.output_path

Expand Down Expand Up @@ -78,15 +84,15 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
dictInfo = {}
project_dir = project_dir.rstrip("/")
logger.info("Calling gfdlcrawler.crawlLocal")
list_files = gfdlcrawler.crawlLocal(project_dir, dictFilter, dictFilterIgnore, logger)
list_files = gfdlcrawler.crawlLocal(project_dir, dictFilter, dictFilterIgnore, logger, configyaml)
#Grabbing data from template JSON, changing CSV path to match output path, and dumping data in new JSON
with open(template_path, "r") as jsonTemplate:
data = json.load(jsonTemplate)
data["catalog_file"] = os.path.abspath(csv_path)
jsonFile = open(json_path, "w")
json.dump(data, jsonFile, indent=2)
jsonFile.close()
headers = CSVwriter.getHeader()
headers = CSVwriter.getHeader(configyaml)

# When we pass relative path or just the filename the following still needs to not choke
# so we check if it's a directory first
Expand Down

0 comments on commit 3ea963c

Please sign in to comment.