Skip to content

Commit

Permalink
Merge pull request #134 from aradhakrishnanGFDL/main
Browse files Browse the repository at this point in the history
Merging main
  • Loading branch information
Ciheim authored Jun 6, 2024
2 parents 794f383 + de35635 commit 3ea963c
Show file tree
Hide file tree
Showing 9 changed files with 94 additions and 63 deletions.
4 changes: 2 additions & 2 deletions cats/gfdl_template.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
},
{
"column_name": "frequency",
"vocabulary": ""
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/CMIP6_frequency.json"
},
{
"column_name": "modeling_realm",
Expand Down Expand Up @@ -47,7 +47,7 @@
},
{
"column_name": "chunk_freq",
"vocabulary": ""
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/GFDL_chunk_freq.json"
},
{
"column_name": "grid_label",
Expand Down
36 changes: 12 additions & 24 deletions configs/config-template.yaml
Original file line number Diff line number Diff line change
@@ -1,41 +1,29 @@
#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.

#catalog headers
#The headerlist is expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#The headerlist contains expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "modeling_realm", "table_id",
"member_id", "grid_label", "variable_id",
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
headerlist: ["activity_id", "institution_id", "source_id", "experiment_id", "frequency", "modeling_realm", "table_id","member_id",
"grid_label", "variable_id", "temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]


#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.


output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']

output_file_template: ['modeling_realm','temporal_subset','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################
#ENTER INPUT PATH HERE (ex: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/)

input_path: ""

#ENTER NAME OF THE CSV AND JSON, WITHOUT THE SUFFIX. (ex: catalog) The builder then generates catalog.csv and catalog.json. This can also be an absolute path)

output_path: ""

input_path: "/Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" #"ENTER INPUT PATH HERE" #Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
1 change: 1 addition & 0 deletions doc/generation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ Catalogs are generated by the following command: *fre catalog buildcatalog <INPU
See `Flags`_ here.

See `Fre-CLI Documentation here <https://ciheim.github.io/fre-cli/>`_

Optional Configuration
----------------------
Expand Down
16 changes: 8 additions & 8 deletions intakebuilder/CSVwriter.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
import os.path
import csv
from csv import writer
from intakebuilder import builderconfig
def getHeader():
from intakebuilder import builderconfig, configparser

def getHeader(configyaml):
'''
    returns header that is the first line in the csv file, refers to builderconfig.py
:return: headerlist with all columns
'''
#TODO move headerlist outside in a separate configuration or
#headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
# "frequency", "modeling_realm", "table_id",
# "member_id", "grid_label", "variable_id",
# "temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
return builderconfig.headerlist
if configyaml:
return configyaml.headerlist
else:
return builderconfig.headerlist

def writeHeader(csvfile):
'''
writing header for the csv
Expand Down
4 changes: 3 additions & 1 deletion intakebuilder/builderconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.

output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']

#output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']

output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']
output_file_template = ['modeling_realm','temporal_subset','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
Expand Down
16 changes: 16 additions & 0 deletions intakebuilder/configparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,19 @@ def __init__(self, config):
print(self.output_path)
except:
raise KeyError("output_path does not exist in config")
try:
self.headerlist = configfile['headerlist']
print(self.headerlist)
except:
raise KeyError("headerlist does not exist in config")
try:
self.output_path_template = configfile['output_path_template']
print(self.output_path_template)
except:
raise KeyError("output_path_template does not exist in config")
try:
self.output_file_template = configfile['output_file_template']
print(self.output_file_template)
except:
raise KeyError("output_file_template does not exist in config")

41 changes: 24 additions & 17 deletions intakebuilder/getinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@
from csv import writer
import os
import xarray as xr
import shutil as sh
from intakebuilder import builderconfig
from intakebuilder import builderconfig, configparser

warning_count = 0;

'''
getinfo.py provides helper functions to get information (from filename, DRS, file/global attributes) needed to populate the catalog
Expand Down Expand Up @@ -84,7 +82,7 @@ def getInfoFromFilename(filename,dictInfo,logger):
#adding this back to trace back some old errors
def getInfoFromGFDLFilename(filename,dictInfo,logger):
# 5 AR: get the following from the netCDF filename e.g. atmos.200501-200912.t_ref.nc
if(filename.endswith(".nc")):
if(filename.endswith(".nc")): #and not filename.startswith(".")):
ncfilename = filename.split(".")
varname = ncfilename[-2]
dictInfo["variable_id"] = varname
Expand All @@ -107,7 +105,7 @@ def getInfoFromGFDLFilename(filename,dictInfo,logger):
logger.debug("Filename not compatible with this version of the builder:"+filename)
return dictInfo

def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo):
def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo,configyaml):
'''
Returns info from project directory and the DRS path to the file
:param dirpath:
Expand All @@ -119,31 +117,40 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo):
# "ensemble_member", "grid_label", "variable",
# "temporal subset", "version", "path"]

#Grab values based on their expected position in path
#Grab values based on their expected position in path
stemdir = dirpath.split("/")
# adding back older versions to ensure we get info from builderconfig
stemdir = dirpath.split("/")
nlen = len(builderconfig.output_path_template)

#lets go backwards and match given input directory to the template, add things to dictInfo
j = -1
cnt = 1
global warning_count
if configyaml:
output_path_template = configyaml.output_path_template
else:
try:
output_path_template = builderconfig.output_path_template
except:
sys.exit("No output_path_template found in builderconfig.py. Check configuration.")

nlen = len(output_path_template)
for i in range(nlen-1,0,-1):
try:
if(builderconfig.output_path_template[i] != "NA"):
dictInfo[builderconfig.output_path_template[i]] = stemdir[(j)]
except:
sys.exit("oops in getInfoFromGFDLDRS"+str(i)+str(j)+builderconfig.output_path_template[i]+stemdir[j])
if(output_path_template[i] != "NA"):
try:
dictInfo[output_path_template[i]] = stemdir[(j)]
except IndexError:
print("Check configuration. Is output path template set correctly?")
exit()
except IndexError:
sys.exit("oops in getInfoFromGFDLDRS"+str(i)+str(j)+output_path_template[i]+stemdir[j])
j = j - 1
cnt = cnt + 1
    # We do not want to work with anything that's not time series
#TODO have verbose option to print message

if (dictInfo["cell_methods"] != "ts" and warning_count < 1):
print("Skipping non-timeseries data")
warning_count = 1

if (dictInfo["cell_methods"] != "ts"):
#print("Skipping non-timeseries data")
return {}
return dictInfo
'''
Expand Down
25 changes: 18 additions & 7 deletions intakebuilder/gfdlcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
It finally returns a list of dict. eg {'project': 'CMIP6', 'path': '/uda/CMIP6/CDRMIP/NCC/NorESM2-LM/esm-pi-cdr-pulse/r1i1p1f1/Emon/zg/gn/v20191108/zg_Emon_NorESM2-LM_esm-pi-cdr-pulse_r1i1p1f1_gn_192001-192912.nc', 'variable': 'zg', 'mip_table': 'Emon', 'model': 'NorESM2-LM', 'experiment_id': 'esm-pi-cdr-pulse', 'ensemble_member': 'r1i1p1f1', 'grid_label': 'gn', 'temporal subset': '192001-192912', 'institute': 'NCC', 'version': 'v20191108'}
'''
def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger):
def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml):
'''
    Crawl through the local directory and run through the getInfo.. functions
:param projectdir:
Expand All @@ -19,6 +19,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger):
pat = re.compile('({}/{}/{}/{})'.format(dictFilter["modeling_realm"],"ts",dictFilter["frequency"],dictFilter["chunk_freq"]))

orig_pat = pat

#TODO INCLUDE filter in traversing through directories at the top
for dirpath, dirs, files in os.walk(projectdir):
searchpath = dirpath
Expand All @@ -27,17 +28,23 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger):
if(pat is not None):
m = re.search(pat, searchpath)
for filename in files:
# get info from filename
filepath = os.path.join(dirpath,filename) # 1 AR: Bugfix: this needs to join dirpath and filename to get the full path to the file

#if filename.startswith("."):
# logger.debug("Skipping hidden file", filepath)
# continue
if not filename.endswith(".nc"):
logger.debug("FILE does not end with .nc. Skipping", filepath)
continue
logger.info(dirpath+"/"+filename)
dictInfo = {}
dictInfo = getinfo.getProject(projectdir, dictInfo)
# get info from filename
filepath = os.path.join(dirpath,filename) # 1 AR: Bugfix: this needs to join dirpath and filename to get the full path to the file
if not filename.endswith(".nc"):
logger.debug("FILE does not end with .nc. Skipping", filepath)
continue
#filepath = os.path.join(dirpath,filename) # 1 AR: Bugfix: this needs to join dirpath and filename to get the full path to the file
dictInfo["path"]=filepath
dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger)
dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo)
dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo,configyaml)
#sys.exit()
list_bad_modellabel = ["","piControl","land-hist","piClim-SO2","abrupt-4xCO2","hist-piAer","hist-piNTCF","piClim-ghg","piClim-OC","hist-GHG","piClim-BC","1pctCO2"]
list_bad_chunklabel = ['DO_NOT_USE']
Expand All @@ -50,11 +57,15 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger):
logger.debug("Found bad chunk, skipping this possibly bad DRS filename",filepath)
continue

if configyaml:
headerlist = configyaml.headerlist
else:
headerlist = builderconfig.headerlist
# remove those keys that are not CSV headers
# move it so its one time
rmkeys = []
for dkeys in dictInfo.keys():
if dkeys not in builderconfig.headerlist:
if dkeys not in headerlist:
rmkeys.append(dkeys)
rmkeys = list(set(rmkeys))

Expand Down
14 changes: 10 additions & 4 deletions scripts/gen_intake_gfdl.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,16 @@
@click.option('--append', is_flag=True, default=False)
def main(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
overwrite=False, append=False):

configyaml = None
# TODO error catching
print("input-path",input_path, config)
if (input_path is None):
#print("input path: ",input_path, " output path: ", output_path)
if input_path is None or output_path is None:
print("No paths given, using yaml configuration")
configyaml = configparser.Config(config)
if configyaml.input_path is None or not configyaml.input_path :
sys.exit("Can't find paths, is yaml configured?")

input_path = configyaml.input_path
output_path = configyaml.output_path

Expand Down Expand Up @@ -78,15 +84,15 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
dictInfo = {}
project_dir = project_dir.rstrip("/")
logger.info("Calling gfdlcrawler.crawlLocal")
list_files = gfdlcrawler.crawlLocal(project_dir, dictFilter, dictFilterIgnore, logger)
list_files = gfdlcrawler.crawlLocal(project_dir, dictFilter, dictFilterIgnore, logger, configyaml)
#Grabbing data from template JSON, changing CSV path to match output path, and dumping data in new JSON
with open(template_path, "r") as jsonTemplate:
data = json.load(jsonTemplate)
data["catalog_file"] = os.path.abspath(csv_path)
jsonFile = open(json_path, "w")
json.dump(data, jsonFile, indent=2)
jsonFile.close()
headers = CSVwriter.getHeader()
headers = CSVwriter.getHeader(configyaml)

# When we pass relative path or just the filename the following still needs to not choke
# so we check if it's a directory first
Expand Down

0 comments on commit 3ea963c

Please sign in to comment.