Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RRFS-SD reader #154

Draft
wants to merge 10 commits into
base: develop
Choose a base branch
from
2 changes: 2 additions & 0 deletions monetio/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
_cesm_se_mm,
_cmaq_mm,
_rrfs_cmaq_mm,
_rrfs_sd_mm,
_wrfchem_mm,
camx,
cmaq,
Expand All @@ -20,6 +21,7 @@
"_cesm_fv_mm",
"_cmaq_mm",
"_rrfs_cmaq_mm",
"_rrfs_sd_mm",
"_wrfchem_mm",
"cmaq",
"camx",
Expand Down
184 changes: 184 additions & 0 deletions monetio/models/_rrfs_sd_mm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
""" RRFS-CMAQ File Reader """
import numpy as np
import xarray as xr
from numpy import concatenate
bbakernoaa marked this conversation as resolved.
Show resolved Hide resolved
from pandas import Series
bbakernoaa marked this conversation as resolved.
Show resolved Hide resolved


def can_do(index):
"""
+ Determines if the given index can be processed.
+
+ Parameters:
+ index (int): The index to check.
+
+ Returns:
+ bool: True if the index is the maximum value, False otherwise.
+ """
if index.max():
return True
else:
return False

bbakernoaa marked this conversation as resolved.
Show resolved Hide resolved

def open_mfdataset(
fname,
var_list=None,
surf_only=False,
**kwargs,
):
# Like WRF-chem add var list that just determines whether to calculate sums or not to speed this up.
"""Method to open RFFS-SD dyn* netcdf files.

Parameters
----------
fname : string or list
fname is the path to the file or files. It will accept hot keys in
strings as well.
surf_only: boolean
Whether to save only surface data to save on memory and computational
cost (True) or not (False).

Returns
-------
xarray.DataSet
RRFS-SD model dataset in standard format for use in MELODIES-MONET

"""

# Read in all variables and do all calculations.
dset = xr.open_mfdataset(fname, concat_dim="time", combine="nested", **kwargs)

if 'smoke' in dset.data_vars and 'dust' in dset.data_vars:
bbakernoaa marked this conversation as resolved.
Show resolved Hide resolved
dset['PM25'] = dset.smoke + dset.dust
dset['PM25'].attrs['long_name'] = 'Particulate Matter < 2.5 microns'
dset['PM25'].attrs['units'] = "ug/kg"
if 'smoke' in dset.data_vars and 'dust' in dset.data_vars and 'coarsepm' in dset.data_vars:
bbakernoaa marked this conversation as resolved.
Show resolved Hide resolved
dset['PM10'] = dset.smoke + dset.dust + dset.coarsepm
bbakernoaa marked this conversation as resolved.
Show resolved Hide resolved
dset['PM10'].attrs['long_name'] = 'Particulate Matter < 10 microns'
dset['PM10'].attrs['units'] = "ug/kg"

# Standardize some variable names
dset = dset.rename(
{
"grid_yt": "y",
"grid_xt": "x",
"pfull": "z",
"phalf": "z_i", # Interface pressure levels
"lon": "longitude",
"lat": "latitude",
"tmp": "temperature_k", # standard temperature (kelvin)
"pressfc": "surfpres_pa",
"dpres": "dp_pa", # Change names so standard surfpres_pa and dp_pa
"hgtsfc": "surfalt_m",
"delz": "dz_m",
}
)

# Calculate pressure. This has to go before sorting because ak and bk
# are not sorted as they are in attributes
dset["pres_pa_mid"] = _calc_pressure(dset,surf_only)

# Adjust pressure levels for all models such that the surface is first.
dset = dset.sortby("z", ascending=False)
dset = dset.sortby("z_i", ascending=False)

# Note this altitude calcs needs to always go after resorting.
# Altitude calculations are all optional, but for each model add values that are easy to calculate.
dset["alt_msl_m_full"] = _calc_hgt(dset)
dset["dz_m"] = dset["dz_m"] * -1.0 # Change to positive values.

# Set coordinates
dset = dset.reset_index(
["x", "y", "z", "z_i"], drop=True
) # For now drop z_i no variables use it.
dset["latitude"] = dset["latitude"].isel(time=0)
dset["longitude"] = dset["longitude"].isel(time=0)
dset = dset.reset_coords()
dset = dset.set_coords(["latitude", "longitude"])

# These sums and units are quite expensive and memory intensive,
# so add option to shrink dataset to just surface when needed
if surf_only:
dset = dset.isel(z=0).expand_dims("z", axis=1)



# convert "ug/kg to ug/m3"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want this to be optional like Jordan's?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I dont see why

for i in dset.variables:
if "units" in dset[i].attrs:
if "ug/kg" in dset[i].attrs["units"]:
# ug/kg -> ug/m3 using dry air density
dset[i] = dset[i] * dset["pres_pa_mid"] / dset["temperature_k"] / 287.05535
dset[i].attrs["units"] = r"$\mu g m^{-3}$"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we really want a pretty version here could use unicode, e.g. μg/m³ or μg m⁻³. But maybe better to use just ASCII ug/m3 or ug m-3.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let’s stick with ascii. It may cause issues down the pipeline if we save the raw files out. It might not be recognized as CF convention otherwise

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then I think ug m-3 best for CF (though they would want kg m-3 for canonical).



# Change the times to pandas format
dset["time"] = dset.indexes["time"].to_datetimeindex(unsafe=True)
# Turn off warning for now. This is just because the model is in julian time

return dset

def _calc_hgt(f):
"""Calculates the geopotential height in m from the variables hgtsfc and
delz. Note: To use this function the delz value needs to go from surface
to top of atmosphere in vertical. Because we are adding the height of
each grid box these are really grid top values

Parameters
----------
f : xarray.Dataset
RRFS-CMAQ model data

Returns
-------
xr.DataArray
Geoptential height with attributes.
"""
sfc = f.surfalt_m.load()
dz = f.dz_m.load() * -1.0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This (loading all dz) is what Maggie's run is failing on, specifically:

Unable to allocate 129. GiB for an array with shape (37, 65, 2961, 4881) and data type float32

Like the pressure calc we should be able to write this in a Dask-friendly way.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also like pressure calc, for the surf-only case there is a short version.

# These are negative in RRFS-CMAQ, but you resorted and are adding from the surface,
# so make them positive.
dz[:, 0, :, :] = dz[:, 0, :, :] + sfc # Add the surface altitude to the first model level only
z = dz.rolling(z=len(f.z), min_periods=1).sum()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would think .cumsum() could be used.

z.name = "alt_msl_m_full"
z.attrs["long_name"] = "Altitude MSL Full Layer in Meters"
z.attrs["units"] = "m"
return z


def _calc_pressure(dset, surf_only=False):
"""Calculate the mid-layer pressure in Pa from surface pressure
and ak and bk constants.

Interface pressures are calculated by:
phalf(k) = a(k) + surfpres * b(k)

Mid layer pressures are calculated by:
pfull(k) = (phalf(k+1)-phalf(k))/log(phalf(k+1)/phalf(k))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure where this formula came from

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@zmoon still not sure where this formula came from

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rschwant do you know? I think that you originally did this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Raffaele sent it to me to describe the vertical structure in the model when I was helping with the boundary conditions. I'll forward you all the email. This is how he described it to me.

Interface pressure levels are computed using the hybrid interface formula:
p(k) = a(k) + ps * b(k)

where ps is the (actual/reference) surface pressure. These pressure levels correspond to phalf in your output dyn*.nc files, while pfull are computed as:
pfull(k) = (phalf(k+1)-phalf(k))/log(phalf(k+1)/phalf(k))

If there is an official typical way of calculating this we can use that instead too. We haven't really tested this much since we have not used the aircraft evaluation in MELDODIES MONET much in the RRFS-CMAQ model.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll compare the two methods' results.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are differences but they are all less than 0.2 hPa in the column I tested.

Differences in Pa:
image

image

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is how Raffaele calculated it for UFS-Aerosols in the exglobal_aero_init function https://github.com/NOAA-EMC/global-workflow/blob/develop/ush/merge_fv3_aerosol_tile.py#L99-L101

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that we should use that

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is calculation of delta p, similar to dpres in the dataset but not the same:

image


Parameters
----------
dset : xarray.Dataset
RRFS-CMAQ model data

Returns
-------
xarray.DataArray
Mid-layer pressure with attributes.
"""
p = dset.dp_pa.copy().load() # Have to load into memory here so can assign levels.
srfpres = dset.surfpres_pa.copy().load()
for k in range(len(dset.z)):
if surf_only:
pres_2 = 0.0 + srfpres * 0.9978736
pres_1 = 0.0 + srfpres * 1.0
else:
pres_2 = dset.ak[k + 1] + srfpres * dset.bk[k + 1]
pres_1 = dset.ak[k] + srfpres * dset.bk[k]
p[:, k, :, :] = (pres_2 - pres_1) / np.log(pres_2 / pres_1)
Comment on lines +152 to +161
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe something like this

Suggested change
p = dset.dp_pa.copy().load() # Have to load into memory here so can assign levels.
srfpres = dset.surfpres_pa.copy().load()
for k in range(len(dset.z)):
if surf_only:
pres_2 = 0.0 + srfpres * 0.9978736
pres_1 = 0.0 + srfpres * 1.0
else:
pres_2 = dset.ak[k + 1] + srfpres * dset.bk[k + 1]
pres_1 = dset.ak[k] + srfpres * dset.bk[k]
p[:, k, :, :] = (pres_2 - pres_1) / np.log(pres_2 / pres_1)
surfpres = dset.surfpres_pa
a = dset.ak
b = dset.bk
if surf_only:
phalf_kp1 = 0.0 + surfpres * 0.9978736
phalf_k = 0.0 + surfpres * 1.0
else:
phalf_kp1 = a.shift(z=-1) + surfpres * b.shift(z=-1)
phalf_k = a + surfpres * b
p = (phalf_kp1 - phalf_k) / np.log(phalf_kp1 / phalf_k)

going for consistency with the docstring variable names

Copy link
Member

@zmoon zmoon Dec 19, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since ak and bk are dataset attrs, not variables

phalf_kp1 = a[1:] + surfpres * b[1:]

but may need to make them DataArrays for the dims to match up properly and such, maybe create

a_k = xr.DataArray(ds.ak[:-1], dims="z")
a_kp1 = xr.DataArray(ds.ak[1:], dims="z")
b_k = xr.DataArray(ds.bk[:-1], dims="z")
b_kp1 = xr.DataArray(ds.bk[1:], dims="z")


p.name = "pres_pa_mid"
p.attrs["units"] = "pa"
p.attrs["long_name"] = "Pressure Mid Layer in Pa"
return p
Loading