diff --git a/catalogbuilder/scripts/__init__.py b/catalogbuilder/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/catalogbuilder/scripts/configs/config-example.yml b/catalogbuilder/scripts/configs/config-example.yml new file mode 100644 index 0000000..2013e59 --- /dev/null +++ b/catalogbuilder/scripts/configs/config-example.yml @@ -0,0 +1,2 @@ +input_path: "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" #"ENTER INPUT PATH HERE" #Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" +output_path: "catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path) diff --git a/catalogbuilder/scripts/configs/config-template.yaml b/catalogbuilder/scripts/configs/config-template.yaml new file mode 100644 index 0000000..8d04a20 --- /dev/null +++ b/catalogbuilder/scripts/configs/config-template.yaml @@ -0,0 +1,41 @@ +#what kind of directory structure to expect? +#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. +#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template +#for the fourth value. + +#catalog headers +#The headerlist is expected column names in your catalog/csv file. 
This is usually determined by the users in conjunction +with the ESM collection specification standards and the appropriate workflows. + +headerlist: ["activity_id", "institution_id", "source_id", "experiment_id", + "frequency", "modeling_realm", "table_id", + "member_id", "grid_label", "variable_id", + "temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"] + +#what kind of directory structure to expect? +#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. +#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply use NA in output_path_template +#for the fourth value. + +output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq'] + +output_file_template: ['modeling_realm','temporal_subset','variable_id'] + +#OUTPUT FILE INFO is currently passed as command-line argument. +#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future. +#csvfile = #jsonfile = #logfile = + +####################################################### + +input_path: "/Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" #"ENTER INPUT PATH HERE" #Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" +output_path: "catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. 
e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path) diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py new file mode 100755 index 0000000..a99b667 --- /dev/null +++ b/catalogbuilder/scripts/gen_intake_gfdl.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python + +import json +import sys +import click +import os +from pathlib import Path +import logging + +logger = logging.getLogger('local') +logger.setLevel(logging.INFO) + +try: + from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser +except ModuleNotFoundError: + print("The module intakebuilder is not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ") + print("Attempting again with adjusted sys.path ") + try: + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + except: + print("Unable to adjust sys.path") + #print(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + try: + from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser + except ModuleNotFoundError: + sys.exit("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ") + +package_dir = os.path.dirname(os.path.abspath(__file__)) +template_path = os.path.join(package_dir, '../cats/gfdl_template.json') + +#Setting up argument parsing/flags +@click.command() +#TODO arguments dont have help message. So consider changing arguments to options? +@click.argument('input_path',required=False,nargs=1) +#,help='The directory path with the datasets to be cataloged. E.g a GFDL PP path till /pp') +@click.argument('output_path',required=False,nargs=1) +#,help='Specify output filename suffix only. e.g. 
catalog') +@click.option('--config',required=False,type=click.Path(exists=True),nargs=1,help='Path to your yaml config, Use the config_template in intakebuilder repo') +@click.option('--filter_realm', nargs=1) +@click.option('--filter_freq', nargs=1) +@click.option('--filter_chunk', nargs=1) +@click.option('--overwrite', is_flag=True, default=False) +@click.option('--append', is_flag=True, default=False) +def main(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None, + overwrite=False, append=False): + + configyaml = None + # TODO error catching + #print("input path: ",input_path, " output path: ", output_path) + if input_path is None or output_path is None: + print("No paths given, using yaml configuration") + configyaml = configparser.Config(config) + if configyaml.input_path is None or not configyaml.input_path : + sys.exit("Can't find paths, is yaml configured?") + + input_path = configyaml.input_path + output_path = configyaml.output_path + + if not os.path.exists(input_path): + sys.exit("Input path does not exist. Adjust configuration.") + if not os.path.exists(Path(output_path).parent.absolute()): + sys.exit("Output path parent directory does not exist. 
Adjust configuration.") + project_dir = input_path + csv_path = "{0}.csv".format(output_path) + json_path = "{0}.json".format(output_path) + + ######### SEARCH FILTERS ########################### + + dictFilter = {} + dictFilterIgnore = {} + if filter_realm: + dictFilter["modeling_realm"] = filter_realm + if filter_freq: + dictFilter["frequency"] = filter_freq + if filter_chunk: + dictFilter["chunk_freq"] = filter_chunk + + ''' Override config file if necessary for dev + project_dir = "/archive/oar.gfdl.cmip6/ESM4/DECK/ESM4_1pctCO2_D1/gfdl.ncrc4-intel16-prod-openmp/pp/" + #for dev csvfile = "/nbhome/$USER/intakebuilder_cats/intake_gfdl2.csv" + dictFilterIgnore = {} + dictFilter["modeling_realm"]= 'atmos_cmip' + dictFilter["frequency"] = "monthly" + dictFilter["chunk_freq"] = "5yr" + dictFilterIgnore["remove"]= 'DO_NOT_USE' + ''' + ######################################################### + dictInfo = {} + project_dir = project_dir.rstrip("/") + logger.info("Calling gfdlcrawler.crawlLocal") + list_files = gfdlcrawler.crawlLocal(project_dir, dictFilter, dictFilterIgnore, logger, configyaml) + #Grabbing data from template JSON, changing CSV path to match output path, and dumping data in new JSON + with open(template_path, "r") as jsonTemplate: + data = json.load(jsonTemplate) + data["catalog_file"] = os.path.abspath(csv_path) + jsonFile = open(json_path, "w") + json.dump(data, jsonFile, indent=2) + jsonFile.close() + headers = CSVwriter.getHeader(configyaml) + + # When we pass relative path or just the filename the following still needs to not choke + # so we check if it's a directory first + if os.path.isdir(os.path.dirname(csv_path)): + os.makedirs(os.path.dirname(csv_path), exist_ok=True) + CSVwriter.listdict_to_csv(list_files, headers, csv_path, overwrite, append) + print("JSON generated at:", os.path.abspath(json_path)) + print("CSV generated at:", os.path.abspath(csv_path)) + logger.info("CSV generated at" + os.path.abspath(csv_path)) + + +if __name__ == 
'__main__': + main() diff --git a/catalogbuilder/scripts/gen_intake_gfdl_notebook.ipynb b/catalogbuilder/scripts/gen_intake_gfdl_notebook.ipynb new file mode 100644 index 0000000..5ec2ff2 --- /dev/null +++ b/catalogbuilder/scripts/gen_intake_gfdl_notebook.ipynb @@ -0,0 +1,4829 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f39f9409-ee87-4431-9953-55607daba427", + "metadata": {}, + "source": [ + "This notebook was tested from a GFDL workstation.\n", + "This notebook is an example of using catalog builder from a notebook to generate data catalogs, a.k.a intake-esm catalogs.\n", + "\n", + "How to get here? \n", + "\n", + "Login to your workstation at GFDL.\n", + "module load python/3.9\n", + "conda activate intakebuilder \n", + "(For the above: Note that you can either install your own environment using the following or use an existing environment such as this: conda activate /nbhome/Aparna.Radhakrishnan/conda/envs/intakebuilder )\n", + "\n", + "conda create -n intakebuilder \n", + "conda install intakebuilder -c noaa-gfdl -n intakebuilder\n", + "\n", + "Now, we do a couple of things to make sure your environment is available to jupyter-lab as a kernel.\n", + "\n", + "pip install ipykernel \n", + "python -m ipykernel install --user --name=intakebuilder\n", + "\n", + "Now, start a jupyter-lab session from GFDL workstation: \n", + "\n", + "jupyter-lab \n", + "\n", + "This will give you the URL to the jupyter-lab session running on your localhost. Paste the URL in your web-browser (or via TigerVNC). Paste the notebook cells from this notebook, or locate the notebook from the path where you have downloaded or cloned it via git. Go to Kernel->Change Kernel-> Choose intakebuilder.\n", + "\n", + "Run the notebook and see the results! Extend it and share it with us via a github issue. 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "fb3010b8-170f-4462-ad2a-457d1d5415f7", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Found existing file! Overwrite? (y/n) y\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "writing..\n", + "JSON generated at: /home/a1r/mycatalog.json\n", + "CSV generated at: /home/a1r/mycatalog.csv\n" + ] + } + ], + "source": [ + "from scripts import gen_intake_gfdl\n", + "import sys,os\n", + "\n", + "######USER input begins########\n", + "\n", + "#User provides the input directory for which a data catalog needs to be generated.\n", + "#Note that depending on the date and version of the tool, only time-series data are catalogued.\n", + "\n", + "input_path = \"/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/\"\n", + "\n", + "#USER inputs the output path. Based on the following setting, user can expect to see /home/a1r/mycatalog.csv and /home/a1r/mycatalog.json generated as output.\n", + "\n", + "output_path = \"/home/a1r/mycatalog\"\n", + "\n", + "####END OF user input ##########\n", + "sys.argv = ['--INPUT_PATH', input_path, output_path]\n", + "\n", + "try:\n", + " gen_intake_gfdl.main()\n", + "except SystemExit as e:\n", + " if e.code != 0:\n", + " raise" + ] + }, + { + "cell_type": "markdown", + "id": "626eaa1f-d801-4a7d-8fad-2851c9e81070", + "metadata": {}, + "source": [ + "Let's begin our analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "181913cc-4776-4b16-95d6-c6ea1b2cbdad", + "metadata": {}, + "outputs": [], + "source": [ + "import intake_esm, intake\n", + "import matplotlib #do a pip install of tools needed in your env or from the notebook\n", + "from matplotlib import pyplot as plt\n", + "%matplotlib inline\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"6665a48b-a335-4fc2-8130-1a4902a428b0", + "metadata": {}, + "outputs": [], + "source": [ + "pip install matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "0f83dbc3-3dda-4a43-82e9-fb8726b2cda8", + "metadata": {}, + "outputs": [], + "source": [ + "col_url = \"/home/a1r/mycatalog.json\"\n", + "col = intake.open_esm_datastore(col_url)" + ] + }, + { + "cell_type": "markdown", + "id": "344ada01-6716-4fbd-9cee-878ff815d7dd", + "metadata": {}, + "source": [ + "Explore the catalog" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "1ce0716e-6667-4aeb-8c4b-50a05643b87f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
activity_idinstitution_idsource_idexperiment_idfrequencymodeling_realmtable_idmember_idgrid_labelvariable_idtemporal_subsetchunk_freqgrid_label.1platformdimensionscell_methodspath
0devNaNam5c96L65_am5f3b1r0_pdclim1850F3hratmos_cmipNaNNaNNaNpr0002010100-00021231231yrNaNgfdl.ncrc5-deploy-prod-openmpNaNts/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd...
1devNaNam5c96L65_am5f3b1r0_pdclim1850F3hratmos_cmipNaNNaNNaNrlut0002010100-00021231231yrNaNgfdl.ncrc5-deploy-prod-openmpNaNts/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd...
2devNaNam5c96L65_am5f3b1r0_pdclim1850F3hratmos_cmipNaNNaNNaNpr0003010100-00031231231yrNaNgfdl.ncrc5-deploy-prod-openmpNaNts/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd...
3devNaNam5c96L65_am5f3b1r0_pdclim1850F3hratmos_cmipNaNNaNNaNrlut0003010100-00031231231yrNaNgfdl.ncrc5-deploy-prod-openmpNaNts/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd...
4devNaNam5c96L65_am5f3b1r0_pdclim1850F3hratmos_cmipNaNNaNNaNpr0004010100-00041231231yrNaNgfdl.ncrc5-deploy-prod-openmpNaNts/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd...
......................................................
6405devNaNam5c96L65_am5f3b1r0_pdclim1850Fmonthlyland_cmipNaNNaNNaNtreeFracNdlDcd001001-0010121yrNaNgfdl.ncrc5-deploy-prod-openmpNaNts/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd...
6406devNaNam5c96L65_am5f3b1r0_pdclim1850Fmonthlyland_cmipNaNNaNNaNtreeFracNdlEvg001001-0010121yrNaNgfdl.ncrc5-deploy-prod-openmpNaNts/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd...
6407devNaNam5c96L65_am5f3b1r0_pdclim1850Fmonthlyland_cmipNaNNaNNaNtsl001001-0010121yrNaNgfdl.ncrc5-deploy-prod-openmpNaNts/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd...
6408devNaNam5c96L65_am5f3b1r0_pdclim1850Fmonthlyland_cmipNaNNaNNaNvegFrac001001-0010121yrNaNgfdl.ncrc5-deploy-prod-openmpNaNts/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd...
6409devNaNam5c96L65_am5f3b1r0_pdclim1850Fmonthlyland_cmipNaNNaNNaNvegHeight001001-0010121yrNaNgfdl.ncrc5-deploy-prod-openmpNaNts/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd...
\n", + "

6410 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " activity_id institution_id source_id experiment_id \\\n", + "0 dev NaN am5 c96L65_am5f3b1r0_pdclim1850F \n", + "1 dev NaN am5 c96L65_am5f3b1r0_pdclim1850F \n", + "2 dev NaN am5 c96L65_am5f3b1r0_pdclim1850F \n", + "3 dev NaN am5 c96L65_am5f3b1r0_pdclim1850F \n", + "4 dev NaN am5 c96L65_am5f3b1r0_pdclim1850F \n", + "... ... ... ... ... \n", + "6405 dev NaN am5 c96L65_am5f3b1r0_pdclim1850F \n", + "6406 dev NaN am5 c96L65_am5f3b1r0_pdclim1850F \n", + "6407 dev NaN am5 c96L65_am5f3b1r0_pdclim1850F \n", + "6408 dev NaN am5 c96L65_am5f3b1r0_pdclim1850F \n", + "6409 dev NaN am5 c96L65_am5f3b1r0_pdclim1850F \n", + "\n", + " frequency modeling_realm table_id member_id grid_label \\\n", + "0 3hr atmos_cmip NaN NaN NaN \n", + "1 3hr atmos_cmip NaN NaN NaN \n", + "2 3hr atmos_cmip NaN NaN NaN \n", + "3 3hr atmos_cmip NaN NaN NaN \n", + "4 3hr atmos_cmip NaN NaN NaN \n", + "... ... ... ... ... ... \n", + "6405 monthly land_cmip NaN NaN NaN \n", + "6406 monthly land_cmip NaN NaN NaN \n", + "6407 monthly land_cmip NaN NaN NaN \n", + "6408 monthly land_cmip NaN NaN NaN \n", + "6409 monthly land_cmip NaN NaN NaN \n", + "\n", + " variable_id temporal_subset chunk_freq grid_label.1 \\\n", + "0 pr 0002010100-0002123123 1yr NaN \n", + "1 rlut 0002010100-0002123123 1yr NaN \n", + "2 pr 0003010100-0003123123 1yr NaN \n", + "3 rlut 0003010100-0003123123 1yr NaN \n", + "4 pr 0004010100-0004123123 1yr NaN \n", + "... ... ... ... ... \n", + "6405 treeFracNdlDcd 001001-001012 1yr NaN \n", + "6406 treeFracNdlEvg 001001-001012 1yr NaN \n", + "6407 tsl 001001-001012 1yr NaN \n", + "6408 vegFrac 001001-001012 1yr NaN \n", + "6409 vegHeight 001001-001012 1yr NaN \n", + "\n", + " platform dimensions cell_methods \\\n", + "0 gfdl.ncrc5-deploy-prod-openmp NaN ts \n", + "1 gfdl.ncrc5-deploy-prod-openmp NaN ts \n", + "2 gfdl.ncrc5-deploy-prod-openmp NaN ts \n", + "3 gfdl.ncrc5-deploy-prod-openmp NaN ts \n", + "4 gfdl.ncrc5-deploy-prod-openmp NaN ts \n", + "... ... ... ... 
\n", + "6405 gfdl.ncrc5-deploy-prod-openmp NaN ts \n", + "6406 gfdl.ncrc5-deploy-prod-openmp NaN ts \n", + "6407 gfdl.ncrc5-deploy-prod-openmp NaN ts \n", + "6408 gfdl.ncrc5-deploy-prod-openmp NaN ts \n", + "6409 gfdl.ncrc5-deploy-prod-openmp NaN ts \n", + "\n", + " path \n", + "0 /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... \n", + "1 /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... \n", + "2 /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... \n", + "3 /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... \n", + "4 /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... \n", + "... ... \n", + "6405 /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... \n", + "6406 /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... \n", + "6407 /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... \n", + "6408 /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... \n", + "6409 /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... \n", + "\n", + "[6410 rows x 17 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "col.df" + ] + }, + { + "cell_type": "markdown", + "id": "613f8259-a92f-4be5-8268-dfbe225f0670", + "metadata": {}, + "source": [ + "Let's narrow down the search" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "62acbaec-573c-47f9-83bc-015790fd7983", + "metadata": {}, + "outputs": [], + "source": [ + "expname_filter = ['c96L65_am5f3b1r0_pdclim1850F']\n", + "modeling_realm = \"land_cmip\"\n", + "frequency = \"daily\"" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7fa86782-3f7b-4dbf-80af-0f035003d57f", + "metadata": {}, + "outputs": [], + "source": [ + "cat = col.search(experiment_id=expname_filter,frequency=frequency,modeling_realm=modeling_realm)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "6fe2cf2f-e74a-4b50-a099-47c28541878d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'hflsLut', 'mrso', 'mrsos'}" + ] + }, + "execution_count": 29, 
+ "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set(cat.df[\"variable_id\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "aa216969-e335-4448-977c-d623a62a697e", + "metadata": {}, + "outputs": [], + "source": [ + "cat = cat.search(variable_id=\"mrso\") #Total Soil Moisture Content" + ] + }, + { + "cell_type": "markdown", + "id": "8542c4e8-07eb-48ba-b466-8e07d3405415", + "metadata": {}, + "source": [ + "dmget the files" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "5227091c-5d83-4b73-a340-22e92124e1f7", + "metadata": {}, + "outputs": [], + "source": [ + "#for simple dmget usage, just use this !dmget {file}\n", + "#use following to wrap the dmget call for each path in the catalog\n", + "def dmgetmagic(x):\n", + " cmd = 'dmget %s'% str(x) \n", + " return os.system(cmd)\n", + "\n", + "#OR refer to importing dmget , https://github.com/aradhakrishnanGFDL/canopy-cats/tree/main/notebooks/dmget.py" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "5eb6b01e-4d68-48ee-904f-dd285be7dee5", + "metadata": {}, + "outputs": [], + "source": [ + "dmstatus = cat.df[\"path\"].apply(dmgetmagic)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "8b50305d-aac1-4df5-add1-fbc9af7773ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--> The keys in the returned dictionary of datasets are constructed as follows:\n", + "\t'source_id.experiment_id.frequency.modeling_realm.variable_id.chunk_freq'\n", + " |████████████████████████████████████████| 100.00% [1/1 00:00<00:00]\r" + ] + } + ], + "source": [ + "dset_dict = cat.to_dataset_dict(cdf_kwargs={'chunks': {'time':5}, 'decode_times': True})" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "f1c27413-e9a7-4855-b9be-1c0b9cf7f4ac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"am5.c96L65_am5f3b1r0_pdclim1850F.daily.land_cmip.mrso.1yr\n" + ] + } + ], + "source": [ + "for k in dset_dict.keys(): \n", + " print(k)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "9aae260f-87c8-4d2a-9b55-b9587c1f2309", + "metadata": {}, + "outputs": [], + "source": [ + "ds = dset_dict[\"am5.c96L65_am5f3b1r0_pdclim1850F.daily.land_cmip.mrso.1yr\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "c650221c-714e-4f2e-a53f-ca937c6c38ae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 757MB\n",
+       "Dimensions:     (time: 3650, bnds: 2, lat: 180, lon: 288)\n",
+       "Coordinates:\n",
+       "    average_DT  (time) timedelta64[ns] 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "    average_T1  (time) object 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "    average_T2  (time) object 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "  * bnds        (bnds) float64 16B 1.0 2.0\n",
+       "  * lat         (lat) float64 1kB -89.5 -88.5 -87.5 -86.5 ... 87.5 88.5 89.5\n",
+       "    lat_bnds    (lat, bnds) float64 3kB dask.array<chunksize=(180, 2), meta=np.ndarray>\n",
+       "  * lon         (lon) float64 2kB 0.625 1.875 3.125 4.375 ... 356.9 358.1 359.4\n",
+       "    lon_bnds    (lon, bnds) float64 5kB dask.array<chunksize=(288, 2), meta=np.ndarray>\n",
+       "  * time        (time) object 29kB 0002-01-01 12:00:00 ... 0011-12-31 12:00:00\n",
+       "    time_bnds   (time, bnds) object 58kB dask.array<chunksize=(5, 2), meta=np.ndarray>\n",
+       "Data variables:\n",
+       "    mrso        (time, lat, lon) float32 757MB dask.array<chunksize=(5, 180, 288), meta=np.ndarray>\n",
+       "Attributes: (12/18)\n",
+       "    title:                            c96L65_am5f3b1r0_pdclim1850F\n",
+       "    grid_type:                        regular\n",
+       "    grid_tile:                        N/A\n",
+       "    code_release_version:             2023.01\n",
+       "    git_hash:                         unknown githash\n",
+       "    external_variables:               land_area\n",
+       "    ...                               ...\n",
+       "    intake_esm_attrs:variable_id:     mrso\n",
+       "    intake_esm_attrs:chunk_freq:      1yr\n",
+       "    intake_esm_attrs:platform:        gfdl.ncrc5-deploy-prod-openmp\n",
+       "    intake_esm_attrs:cell_methods:    ts\n",
+       "    intake_esm_attrs:_data_format_:   netcdf\n",
+       "    intake_esm_dataset_key:           am5.c96L65_am5f3b1r0_pdclim1850F.daily....
" + ], + "text/plain": [ + " Size: 757MB\n", + "Dimensions: (time: 3650, bnds: 2, lat: 180, lon: 288)\n", + "Coordinates:\n", + " average_DT (time) timedelta64[ns] 29kB dask.array\n", + " average_T1 (time) object 29kB dask.array\n", + " average_T2 (time) object 29kB dask.array\n", + " * bnds (bnds) float64 16B 1.0 2.0\n", + " * lat (lat) float64 1kB -89.5 -88.5 -87.5 -86.5 ... 87.5 88.5 89.5\n", + " lat_bnds (lat, bnds) float64 3kB dask.array\n", + " * lon (lon) float64 2kB 0.625 1.875 3.125 4.375 ... 356.9 358.1 359.4\n", + " lon_bnds (lon, bnds) float64 5kB dask.array\n", + " * time (time) object 29kB 0002-01-01 12:00:00 ... 0011-12-31 12:00:00\n", + " time_bnds (time, bnds) object 58kB dask.array\n", + "Data variables:\n", + " mrso (time, lat, lon) float32 757MB dask.array\n", + "Attributes: (12/18)\n", + " title: c96L65_am5f3b1r0_pdclim1850F\n", + " grid_type: regular\n", + " grid_tile: N/A\n", + " code_release_version: 2023.01\n", + " git_hash: unknown githash\n", + " external_variables: land_area\n", + " ... ...\n", + " intake_esm_attrs:variable_id: mrso\n", + " intake_esm_attrs:chunk_freq: 1yr\n", + " intake_esm_attrs:platform: gfdl.ncrc5-deploy-prod-openmp\n", + " intake_esm_attrs:cell_methods: ts\n", + " intake_esm_attrs:_data_format_: netcdf\n", + " intake_esm_dataset_key: am5.c96L65_am5f3b1r0_pdclim1850F.daily...." + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "84071a21-5f29-4554-99cb-7c02bda9d1f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'mrso' (time: 3650, lat: 180, lon: 288)> Size: 757MB\n",
+       "dask.array<concatenate, shape=(3650, 180, 288), dtype=float32, chunksize=(5, 180, 288), chunktype=numpy.ndarray>\n",
+       "Coordinates:\n",
+       "    average_DT  (time) timedelta64[ns] 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "    average_T1  (time) object 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "    average_T2  (time) object 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "  * lat         (lat) float64 1kB -89.5 -88.5 -87.5 -86.5 ... 87.5 88.5 89.5\n",
+       "  * lon         (lon) float64 2kB 0.625 1.875 3.125 4.375 ... 356.9 358.1 359.4\n",
+       "  * time        (time) object 29kB 0002-01-01 12:00:00 ... 0011-12-31 12:00:00\n",
+       "Attributes:\n",
+       "    units:            kg m-2\n",
+       "    long_name:        Total Soil Moisture Content\n",
+       "    cell_methods:     area: mean time: mean\n",
+       "    ocean_fillvalue:  0.0\n",
+       "    cell_measures:    area: land_area\n",
+       "    time_avg_info:    average_T1,average_T2,average_DT\n",
+       "    standard_name:    soil_moisture_content\n",
+       "    interp_method:    conserve_order1
" + ], + "text/plain": [ + " Size: 757MB\n", + "dask.array\n", + "Coordinates:\n", + " average_DT (time) timedelta64[ns] 29kB dask.array\n", + " average_T1 (time) object 29kB dask.array\n", + " average_T2 (time) object 29kB dask.array\n", + " * lat (lat) float64 1kB -89.5 -88.5 -87.5 -86.5 ... 87.5 88.5 89.5\n", + " * lon (lon) float64 2kB 0.625 1.875 3.125 4.375 ... 356.9 358.1 359.4\n", + " * time (time) object 29kB 0002-01-01 12:00:00 ... 0011-12-31 12:00:00\n", + "Attributes:\n", + " units: kg m-2\n", + " long_name: Total Soil Moisture Content\n", + " cell_methods: area: mean time: mean\n", + " ocean_fillvalue: 0.0\n", + " cell_measures: area: land_area\n", + " time_avg_info: average_T1,average_T2,average_DT\n", + " standard_name: soil_moisture_content\n", + " interp_method: conserve_order1" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds[\"mrso\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "d8e8cd0c-5502-4564-bb12-a269781415ad", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "mrso = ds.mrso.isel(time=1).plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "68b4a24c-0720-476b-8061-c42c84608e5d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ds.mrso.mean(dim='time').plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "9212d429-8cd2-4ef6-a498-2fed900091d9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 00020101-00021231\n", + "1 00030101-00031231\n", + "2 00040101-00041231\n", + "3 00050101-00051231\n", + "4 00060101-00061231\n", + "5 00070101-00071231\n", + "6 00080101-00081231\n", + "7 00090101-00091231\n", + "8 00110101-00111231\n", + "9 00100101-00101231\n", + "Name: temporal_subset, dtype: object" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat.df['temporal_subset'] " + ] + }, + { + "cell_type": "markdown", + "id": "06746aff-889b-4c67-b2d7-fb5ae821a678", + "metadata": {}, + "source": [ + "Can I please leverage CF? " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d7dadd5-7abd-4bf7-a6ca-e39d3c214b04", + "metadata": {}, + "outputs": [], + "source": [ + "pip install cf_xarray" + ] + }, + { + "cell_type": "markdown", + "id": "3f248b8e-2d65-469c-b41f-f1875fac7317", + "metadata": {}, + "source": [ + "#You may leverage the use of cf_xarray, xMIP etc to build your analyses from here. They all blend in." + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "c47d02a6-c340-45f6-8f84-f26e691358ca", + "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr\n", + "import cf_xarray as cfxr" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "c6cb19f4-6409-4e32-9119-b0d51b42eb33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 757MB\n",
+       "Dimensions:     (time: 3650, bnds: 2, lat: 180, lon: 288)\n",
+       "Coordinates:\n",
+       "    average_DT  (time) timedelta64[ns] 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "    average_T1  (time) object 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "    average_T2  (time) object 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "  * bnds        (bnds) float64 16B 1.0 2.0\n",
+       "  * lat         (lat) float64 1kB -89.5 -88.5 -87.5 -86.5 ... 87.5 88.5 89.5\n",
+       "    lat_bnds    (lat, bnds) float64 3kB dask.array<chunksize=(180, 2), meta=np.ndarray>\n",
+       "  * lon         (lon) float64 2kB 0.625 1.875 3.125 4.375 ... 356.9 358.1 359.4\n",
+       "    lon_bnds    (lon, bnds) float64 5kB dask.array<chunksize=(288, 2), meta=np.ndarray>\n",
+       "  * time        (time) object 29kB 0002-01-01 12:00:00 ... 0011-12-31 12:00:00\n",
+       "    time_bnds   (time, bnds) object 58kB dask.array<chunksize=(5, 2), meta=np.ndarray>\n",
+       "Data variables:\n",
+       "    mrso        (time, lat, lon) float32 757MB dask.array<chunksize=(5, 180, 288), meta=np.ndarray>\n",
+       "Attributes: (12/18)\n",
+       "    title:                            c96L65_am5f3b1r0_pdclim1850F\n",
+       "    grid_type:                        regular\n",
+       "    grid_tile:                        N/A\n",
+       "    code_release_version:             2023.01\n",
+       "    git_hash:                         unknown githash\n",
+       "    external_variables:               land_area\n",
+       "    ...                               ...\n",
+       "    intake_esm_attrs:variable_id:     mrso\n",
+       "    intake_esm_attrs:chunk_freq:      1yr\n",
+       "    intake_esm_attrs:platform:        gfdl.ncrc5-deploy-prod-openmp\n",
+       "    intake_esm_attrs:cell_methods:    ts\n",
+       "    intake_esm_attrs:_data_format_:   netcdf\n",
+       "    intake_esm_dataset_key:           am5.c96L65_am5f3b1r0_pdclim1850F.daily....
" + ], + "text/plain": [ + " Size: 757MB\n", + "Dimensions: (time: 3650, bnds: 2, lat: 180, lon: 288)\n", + "Coordinates:\n", + " average_DT (time) timedelta64[ns] 29kB dask.array\n", + " average_T1 (time) object 29kB dask.array\n", + " average_T2 (time) object 29kB dask.array\n", + " * bnds (bnds) float64 16B 1.0 2.0\n", + " * lat (lat) float64 1kB -89.5 -88.5 -87.5 -86.5 ... 87.5 88.5 89.5\n", + " lat_bnds (lat, bnds) float64 3kB dask.array\n", + " * lon (lon) float64 2kB 0.625 1.875 3.125 4.375 ... 356.9 358.1 359.4\n", + " lon_bnds (lon, bnds) float64 5kB dask.array\n", + " * time (time) object 29kB 0002-01-01 12:00:00 ... 0011-12-31 12:00:00\n", + " time_bnds (time, bnds) object 58kB dask.array\n", + "Data variables:\n", + " mrso (time, lat, lon) float32 757MB dask.array\n", + "Attributes: (12/18)\n", + " title: c96L65_am5f3b1r0_pdclim1850F\n", + " grid_type: regular\n", + " grid_tile: N/A\n", + " code_release_version: 2023.01\n", + " git_hash: unknown githash\n", + " external_variables: land_area\n", + " ... ...\n", + " intake_esm_attrs:variable_id: mrso\n", + " intake_esm_attrs:chunk_freq: 1yr\n", + " intake_esm_attrs:platform: gfdl.ncrc5-deploy-prod-openmp\n", + " intake_esm_attrs:cell_methods: ts\n", + " intake_esm_attrs:_data_format_: netcdf\n", + " intake_esm_dataset_key: am5.c96L65_am5f3b1r0_pdclim1850F.daily...." + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xr.decode_cf(ds)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "0dc03c24-25b6-48f6-9c44-d8bb677244eb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'mrso' (time: 0, lat: 180, lon: 288)> Size: 0B\n",
+       "dask.array<getitem, shape=(0, 180, 288), dtype=float32, chunksize=(0, 180, 288), chunktype=numpy.ndarray>\n",
+       "Coordinates:\n",
+       "    average_DT  (time) float64 0B dask.array<chunksize=(0,), meta=np.ndarray>\n",
+       "    average_T1  (time) float64 0B dask.array<chunksize=(0,), meta=np.ndarray>\n",
+       "    average_T2  (time) float64 0B dask.array<chunksize=(0,), meta=np.ndarray>\n",
+       "  * lat         (lat) float64 1kB -89.5 -88.5 -87.5 -86.5 ... 87.5 88.5 89.5\n",
+       "  * lon         (lon) float64 2kB 0.625 1.875 3.125 4.375 ... 356.9 358.1 359.4\n",
+       "  * time        (time) float64 0B \n",
+       "Attributes:\n",
+       "    units:            kg m-2\n",
+       "    long_name:        Total Soil Moisture Content\n",
+       "    cell_methods:     area: mean time: mean\n",
+       "    ocean_fillvalue:  0.0\n",
+       "    cell_measures:    area: land_area\n",
+       "    time_avg_info:    average_T1,average_T2,average_DT\n",
+       "    standard_name:    soil_moisture_content\n",
+       "    interp_method:    conserve_order1
" + ], + "text/plain": [ + " Size: 0B\n", + "dask.array\n", + "Coordinates:\n", + " average_DT (time) float64 0B dask.array\n", + " average_T1 (time) float64 0B dask.array\n", + " average_T2 (time) float64 0B dask.array\n", + " * lat (lat) float64 1kB -89.5 -88.5 -87.5 -86.5 ... 87.5 88.5 89.5\n", + " * lon (lon) float64 2kB 0.625 1.875 3.125 4.375 ... 356.9 358.1 359.4\n", + " * time (time) float64 0B \n", + "Attributes:\n", + " units: kg m-2\n", + " long_name: Total Soil Moisture Content\n", + " cell_methods: area: mean time: mean\n", + " ocean_fillvalue: 0.0\n", + " cell_measures: area: land_area\n", + " time_avg_info: average_T1,average_T2,average_DT\n", + " standard_name: soil_moisture_content\n", + " interp_method: conserve_order1" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds.mrso.sel(time=slice(\"0002-01-01\",\"0004-01-01\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "4f443874-7a2d-4856-b687-84a8f02a0f83", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'time' (time: 3650)> Size: 29kB\n",
+       "array([ 365.5,  366.5,  367.5, ..., 4012.5, 4013.5, 4014.5])\n",
+       "Coordinates:\n",
+       "    average_DT  (time) float64 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "    average_T1  (time) float64 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "    average_T2  (time) float64 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n",
+       "  * time        (time) float64 29kB 365.5 366.5 367.5 ... 4.014e+03 4.014e+03\n",
+       "Attributes:\n",
+       "    units:          days since 0001-01-01 00:00:00\n",
+       "    long_name:      time\n",
+       "    axis:           T\n",
+       "    calendar_type:  NOLEAP\n",
+       "    calendar:       noleap\n",
+       "    bounds:         time_bnds\n",
+       "    cell_methods:   time: mean
" + ], + "text/plain": [ + " Size: 29kB\n", + "array([ 365.5, 366.5, 367.5, ..., 4012.5, 4013.5, 4014.5])\n", + "Coordinates:\n", + " average_DT (time) float64 29kB dask.array\n", + " average_T1 (time) float64 29kB dask.array\n", + " average_T2 (time) float64 29kB dask.array\n", + " * time (time) float64 29kB 365.5 366.5 367.5 ... 4.014e+03 4.014e+03\n", + "Attributes:\n", + " units: days since 0001-01-01 00:00:00\n", + " long_name: time\n", + " axis: T\n", + " calendar_type: NOLEAP\n", + " calendar: noleap\n", + " bounds: time_bnds\n", + " cell_methods: time: mean" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds.mrso.time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a61e9c94-5d20-44d1-9a0a-6dab48dc444c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "intakebuilder", + "language": "python", + "name": "intakebuilder" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/catalogbuilder/scripts/gen_intake_gfdl_runner.py b/catalogbuilder/scripts/gen_intake_gfdl_runner.py new file mode 100755 index 0000000..920ede8 --- /dev/null +++ b/catalogbuilder/scripts/gen_intake_gfdl_runner.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python + +from scripts import gen_intake_gfdl +import sys + +input_path = "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" +output_path = "test" +sys.argv = ['INPUT_PATH', input_path, output_path] +print(sys.argv) +gen_intake_gfdl.main() + diff --git a/catalogbuilder/scripts/gen_intake_gfdl_runner_config.py b/catalogbuilder/scripts/gen_intake_gfdl_runner_config.py new file mode 100755 index 
def main():
    """Crawl a local project directory and write the results to a CSV file.

    The inputs are hard-coded below: ``project_dir`` is handed to
    ``localcrawler.crawlLocal`` together with the search filters and the
    module-level ``logger``; the list it returns is written to ``csvfile``
    by ``CSVwriter.listdict_to_csv`` using the headers from
    ``CSVwriter.getHeader()``.
    """
    # ---- INPUT HERE (or read from a config file later) ----
    project_dir = "/uda/CMIP6/"  # DRS-compliant project directory
    csvfile = "/nbhome/a1r/intakebuilder_cats/intake_local.csv"

    # ---- SEARCH FILTERS ----
    # "source_prefix" must be set, at least to the project level.
    dictFilter = {"source_prefix": "CMIP6/"}
    # Optional filters; add them only if you want to narrow the crawl:
    #   dictFilter["miptable"] = "Amon"   # filter by MIP table
    #   dictFilter["varname"] = "tas"     # filter by variable name

    project_dir = project_dir.rstrip("/")
    logger.info("Calling localcrawler.crawlLocal")
    print("Calling localcrawler.crawlLocal")
    list_files = localcrawler.crawlLocal(project_dir, dictFilter, logger)
    headers = CSVwriter.getHeader()

    # Make sure the destination directory exists before the first write.
    # NOTE(review): indentation was lost in the reviewed chunk; this assumes
    # only the makedirs call is guarded and the CSV is always (re)written —
    # confirm against the original file.
    if not os.path.exists(csvfile):
        os.makedirs(os.path.dirname(csvfile), exist_ok=True)
    CSVwriter.listdict_to_csv(list_files, headers, csvfile)

    csv_abspath = os.path.abspath(csvfile)
    print("CSV generated at:", csv_abspath)
    # Lazy %-style args; also restores the separator that the previous
    # string concatenation ("CSV generated at" + path) dropped.
    logger.info("CSV generated at %s", csv_abspath)
def _exit_or_print(message, test_failure):
    """Print ``message`` when ``test_failure`` is set, otherwise exit with it."""
    if test_failure:
        print(message)
    else:
        sys.exit(message)


@click.command()
@click.argument('json_path', nargs = 1 , required = True)
@click.argument('json_template_path', nargs = 1 , required = False)
@click.option('-tf', '--test-failure', is_flag=True, default = False, help="Errors are only printed. Program will not exit.")
def main(json_path,json_template_path,test_failure):

    """ This test ensures catalogs generated by the Catalog Builder tool are
    minimally valid. This means a few things: the generated catalog JSON file
    reflects the template it was generated with, the catalog CSV has at least
    one row of values (not headers), and each required column exists without
    any empty values. If a test case is broken or expected to fail, the
    --test-failure/-tf flag can be used. This flag will simply print errors
    instead of doing a sys.exit.

    JSON_PATH: Path to generated schema to be tested

    JSON_TEMPLATE_PATH: Path of schema template. Without a given path,
    cats/gfdl_template.json will be used for comparison """

    # Open both JSON documents; the context managers close the file handles
    # that the bare json.load(open(...)) calls used to leak.
    with open(json_path) as json_file:
        j = json.load(json_file)
    # The default template path is relative to the current working directory.
    template_path = json_template_path if json_template_path else 'cats/gfdl_template.json'
    with open(template_path) as template_file:
        json_template = json.load(template_file)

    # Validate JSON against the template. Only 'catalog_file' may differ.
    comp = diff(j, json_template)
    for key in comp.keys():
        if key == 'catalog_file':
            continue
        # jsondiff can report changes under non-string marker keys (such as
        # its delete symbol); formatting the key instead of concatenating it
        # avoids a TypeError while leaving the text for string keys unchanged.
        _exit_or_print(f"{key} section of JSON does not refect template", test_failure)

    # Get CSV from JSON and open it
    csv_path = j["catalog_file"]
    catalog = pd.read_csv(csv_path)

    if len(catalog.index) < 1:
        _exit_or_print("Catalog has no values", test_failure)

    # Required columns are the group-by attributes declared in the schema.
    req = j["aggregation_control"]["groupby_attrs"]

    # Check the csv headers for required columns/values
    errors = 0
    for column in req:
        if column not in catalog.columns:
            print(f"The required column '{column}' does not exist in the csv. In other words, there is some inconsistency between the json and the csv file. Please check out info listed under aggregation_control and groupby_attrs in your json file and verify if those columns show up in the csv as well.")
            errors += 1
        elif catalog[column].isnull().values.any():
            print(f"'{column}' contains empty values.")
            errors += 1

    if errors > 0:
        _exit_or_print(f"Found {errors} errors.", test_failure)