-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ESA CCI file checking #53
Changes from all commits
f0f8876
78e3e80
4a7de35
b914fc0
445b0a4
80db5cb
f0ef22c
0a47805
5d1ef42
84ead53
01f8438
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,7 +11,9 @@ | |
# Accepts a run of exactly 4, 6 or 8 digits, or 8 digits followed by a hyphen
# and a further 2, 4 or 6 digits (presumably YYYY[MM[DD]] and
# YYYYMMDD-HH[MM[SS]] -- confirm against the callers of this pattern).
DATE_REGEX = re.compile(r"^\d{4}$|^\d{6}$|^\d{8}$|^\d{8}-\d{2}$|^\d{8}-\d{4}$|^\d{8}-\d{6}$")
|
||
# Accepts an unbroken run of exactly 4, 6, 8, 10, 12 or 14 digits and nothing
# else. This is only a shape check; it does not validate that the digits form
# a real calendar date or time.
DATE_REGEX_GENERIC = re.compile(r"^\d{4}$|^\d{6}$|^\d{8}$|^\d{10}$|^\d{12}$|^\d{14}$")
|
||
def _get_bounds_var_ids(dct): | ||
return [ | ||
|
@@ -552,6 +554,132 @@ def check_file_name(file_name, vocab_checks=None, rule_checks=None, **kwargs): | |
|
||
return errors, warnings | ||
|
||
def check_generic_file_name(
    file_name,
    vocab_checks=None,
    segregator=None,
    extension=None,
    spec_verbose=False,
    **kwargs,
):
    """
    Check a file name, field by field, against a user-defined specification.

    The extension is stripped from the end of ``file_name`` and the remainder
    is split on the segregator. The part at position ``i`` is checked against
    the rule stored under the key ``"field{i:02}"`` in ``vocab_checks``.
    A rule is a string starting with one of:

    * ``__vocabs__`` or ``__URL__`` - the whole rule and the part are handed
      to ``vocabs.check``; any non-empty result is reported as an error.
    * ``__date__:<fmt>[,<fmt>...]`` - the part must be 4, 6, 8, 10, 12 or 14
      digits and must parse with at least one ``strptime`` format listed.
    * ``__version__:<regex>`` - the part must match the regular expression
      (anchored at the start, as with ``re.match``).

    Args:
        file_name: File name to check (a ``str``).
        vocab_checks: Mapping of ``"field00"``, ``"field01"``, ... to rules.
        segregator: Field separator, given either as ``{"seg": "-"}`` or as a
            plain string. Defaults to ``"_"``.
        extension: File extension, given either as ``{"ext": ".nc"}`` or as a
            plain string. Defaults to ``".test"``.
        spec_verbose: Verbosity switch, given either as
            ``{"spec_verb": True}`` or as a plain bool. When truthy, progress
            and every recorded error are printed.
        **kwargs: Accepted and ignored.

    Returns:
        A tuple ``(errors, warnings)`` of lists of strings. ``warnings`` is
        always empty.

    Raises:
        ValueError: If ``file_name`` is not a string.
    """
    vocab_checks = vocab_checks or {}

    # Each option may arrive wrapped in a one-key mapping or as a bare value;
    # anything else falls back to the default.
    seg = _generic_file_name_option(segregator, "seg", "_", str)
    ext = _generic_file_name_option(extension, "ext", ".test", str)
    spec_verb = _generic_file_name_option(spec_verbose, "spec_verb", False, bool)

    errors = []
    warnings = []

    if not isinstance(file_name, str):
        raise ValueError(
            f"file_name must be a string, not {type(file_name).__name__}"
        )

    # Strip the extension only where it is a suffix, so that the same text
    # occurring inside a field is left untouched.
    if ext and file_name.endswith(ext):
        extracted_name = file_name[: -len(ext)]
    else:
        extracted_name = file_name
    file_name_parts = extracted_name.split(seg)

    if spec_verb:
        print(f"File name: {file_name}")
        print(f"Segregator: {seg}")
        print(f"Extension: {ext}")
        print(f"All file name parts: {file_name_parts}")

    # The number of parts must equal the number of rules. This does not
    # depend on the individual part, so it is checked once, up front.
    n_parts = len(file_name_parts)
    n_fields = len(vocab_checks)
    if n_parts != n_fields:
        relation = "greater" if n_parts > n_fields else "less"
        _record_file_name_error(
            errors,
            f"[file name]: Number of file name fields ({n_parts}) is {relation} than the {n_fields} fields expected.",
            spec_verb,
        )
        return errors, warnings

    for idx, key in enumerate(file_name_parts):
        if spec_verb:
            print('')
            print(idx, key)

        field_name = f"field{idx:02}"
        field = vocab_checks.get(field_name)
        if not isinstance(field, str):
            # A gap in the fieldNN numbering (or a non-string rule) is a
            # specification problem; report it rather than crash.
            _record_file_name_error(
                errors,
                f"[file name]: No rule defined for '{field_name}' in the file name specification.",
                spec_verb,
            )
            continue

        if field.startswith(('__vocabs__', '__URL__')):
            # VOCAB (config or URL)
            if vocabs.check(field, key, spec_verb=spec_verb) != []:
                _record_file_name_error(
                    errors,
                    f"[file name]: Unknown field '{key}' in vocab {field}.",
                    spec_verb,
                )
            continue

        if field.startswith('__date__'):
            # DATE: everything after the first ':' is a comma-separated list
            # of strptime formats.
            fmts = field.partition(":")[2].split(",")
            if spec_verb:
                print(f"Valid date formats: {fmts}")

            if not DATE_REGEX_GENERIC.match(key):
                _record_file_name_error(
                    errors,
                    f"[file name]: Expecting date/time - bad date format '{key}'",
                    spec_verb,
                )
                continue

            if _matches_any_date_format(key, fmts):
                if spec_verb:
                    print(f"Date string {key} matches the required format")
            else:
                _record_file_name_error(
                    errors,
                    f"[file name]: Invalid date/time string '{key}'. Date/time should take the form YYYY[MM[DD[HH[MM[SS]]]]], where the fields in brackets are optional.",
                    spec_verb,
                )
            continue

        if field.startswith('__version__'):
            # FILE/PRODUCT VERSION: everything after the first ':' is the
            # regular expression, which may itself contain ':'.
            verfmt = field.partition(":")[2]
            try:
                version_ok = re.match(verfmt, key) is not None
            except re.error:
                _record_file_name_error(
                    errors,
                    f"[file name]: Invalid version pattern '{verfmt}' in field specification {field}.",
                    spec_verb,
                )
                continue

            if version_ok:
                if spec_verb:
                    print(f"File version {key} matches the required format")
            else:
                _record_file_name_error(
                    errors,
                    f"[file name]: Invalid file version '{key}'. File versions should take the form n{{1,}}[.n{{1,}}].",
                    spec_verb,
                )
            continue

        # FIELD NOT RECOGNISED
        _record_file_name_error(
            errors,
            f"[file name]: {field} field type not recognised.",
            spec_verb,
        )

    return errors, warnings


def _generic_file_name_option(value, key, default, plain_type):
    """Return ``value[key]`` for a dict, ``value`` itself if it is a ``plain_type``, else ``default``."""
    if isinstance(value, dict):
        return value.get(key, default)
    if isinstance(value, plain_type):
        return value
    return default


def _record_file_name_error(errors, message, verbose):
    """Append ``message`` to ``errors`` and print it when ``verbose`` is truthy."""
    errors.append(message)
    if verbose:
        print(message)


def _matches_any_date_format(value, formats):
    """Return True if ``value`` parses with at least one of the strptime ``formats``."""
    for fmt in formats:
        try:
            dt.datetime.strptime(value, fmt)
        except ValueError:
            continue
        return True
    return False
|
||
def check_radar_moment_variables( | ||
dct, exist_attrs=None, rule_attrs=None, one_of_attrs=None, skip_spellcheck=False | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
{ | ||
"field00": ["ESACCI"], | ||
"field01": ["AEROSOL","AIS","BIOMASS","CLOUD","FIRE","GHG","GHRSST","GIS","GLACIERS","HRLC","ICESHEETS","LAKES","LC","LST","OC","OZONE","PERMAFROST","RD","SEAICE","SEALEVEL","SEASTATE","SEASURFACESALINITY","SNOW","SOILMOISTURE","SST","VEGETATION","WATERVAPOUR"], | ||
"field02": ["L0","L1A","L1B","L1C","L2","L2P","L3","L3U","L3C","L3S","L4","IND"] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
{ | ||
"Conventions": ["CF-1.5","CF-1.6","CF-1.7","CF-1.8","CF-1.9"], | ||
"project": "Climate Change Initiative - European Space Agency", | ||
"license": "ESA CCI Data Policy: free and open access" | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
file-name-format: | ||
func: checksit.generic.check_generic_file_name | ||
params: | ||
vocab_checks: | ||
# ESACCI | ||
field00: __vocabs__:esa-cci-file-name-config:field00 | ||
# CCI Project (e.g. SEAICE) | ||
field01: __vocabs__:esa-cci-file-name-config:field01 | ||
# Processing Level (e.g. L3C) | ||
field02: __vocabs__:esa-cci-file-name-config:field02 | ||
# Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary | ||
# https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json | ||
field03: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json | ||
# Product String (e.g. NIMBUS5_ESMR-EASE2_NH) | ||
# http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json | ||
field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json | ||
# Additional segregator (also stored in the 'product' vocabulary) | ||
field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json | ||
# Date and time | ||
field06: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S | ||
# File version | ||
field07: __version__:^fv\d?\d\.?\d?\d?$ | ||
segregator: | ||
seg: '-' | ||
extension: | ||
ext: '.nc' | ||
spec_verbose: | ||
spec_verb: True |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
file-name-format: | ||
func: checksit.generic.check_generic_file_name | ||
params: | ||
vocab_checks: | ||
# Date and time | ||
field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S | ||
# ESACCI | ||
field01: __vocabs__:esa-cci-file-name-config:field00 | ||
# Processing Level (e.g. L3C) | ||
field02: __vocabs__:esa-cci-file-name-config:field02 | ||
# CCI Project (e.g. SEAICE) | ||
field03: __vocabs__:esa-cci-file-name-config:field01 | ||
# Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary | ||
# https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json | ||
field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json | ||
# Product String (e.g. NIMBUS5_ESMR-EASE2_NH) | ||
# http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json | ||
field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json | ||
# Additional segregator (also stored in the 'product' vocabulary) | ||
field06: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json | ||
# GDS version | ||
field07: __version__:^v\d?\d\.?\d?\d?$ | ||
# File version | ||
field08: __version__:^fv\d?\d\.?\d?\d?$ | ||
segregator: | ||
seg: '-' | ||
extension: | ||
ext: '.nc' | ||
spec_verbose: | ||
spec_verb: True |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
file-name-format: | ||
func: checksit.generic.check_generic_file_name | ||
params: | ||
vocab_checks: | ||
# Date and time | ||
field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S | ||
# ESACCI | ||
field01: __vocabs__:esa-cci-file-name-config:field00 | ||
# Processing Level (e.g. L3C) | ||
field02: __vocabs__:esa-cci-file-name-config:field02 | ||
# CCI Project (e.g. SEAICE) | ||
field03: __vocabs__:esa-cci-file-name-config:field01 | ||
# Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary | ||
# https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json | ||
field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json | ||
# Product String (e.g. NIMBUS5_ESMR-EASE2_NH) | ||
# http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json | ||
field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json | ||
# Additional segregator (also stored in the 'product' vocabulary) | ||
field06: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json | ||
# File version | ||
field07: __version__:^fv\d?\d\.?\d?\d?$ | ||
segregator: | ||
seg: '-' | ||
extension: | ||
ext: '.nc' | ||
spec_verbose: | ||
spec_verb: True |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
file-name-format: | ||
func: checksit.generic.check_generic_file_name | ||
params: | ||
vocab_checks: | ||
# Date and time | ||
field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S | ||
# ESACCI | ||
field01: __vocabs__:esa-cci-file-name-config:field00 | ||
# Processing Level (e.g. L3C) | ||
field02: __vocabs__:esa-cci-file-name-config:field02 | ||
# CCI Project (e.g. SEAICE) | ||
field03: __vocabs__:esa-cci-file-name-config:field01 | ||
# Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary | ||
# https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json | ||
field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json | ||
# Product String (e.g. NIMBUS5_ESMR-EASE2_NH) | ||
# http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json | ||
field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json | ||
# GDS version | ||
field06: __version__:^v\d?\d\.?\d?\d?$ | ||
# File version | ||
field07: __version__:^fv\d?\d\.?\d?\d?$ | ||
segregator: | ||
seg: '-' | ||
extension: | ||
ext: '.nc' | ||
spec_verbose: | ||
spec_verb: True |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure the replace section in here is needed, as it should have been done at the start of the `_load_from_url` function (I can see that this duplication was there before though, so this needs to be checked).