Skip to content

Commit

Permalink
Merge pull request #53 from cedadev/esa-cci
Browse files Browse the repository at this point in the history
ESA CCI file checking
  • Loading branch information
joshua-hampton authored Dec 20, 2024
2 parents cd9cbf7 + 01f8438 commit 3ee22df
Show file tree
Hide file tree
Showing 14 changed files with 490 additions and 17 deletions.
2 changes: 1 addition & 1 deletion checksit/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def _check_file(

for spec in specs:
sr = SpecificationChecker(spec)
if "amof-file-name" in spec:
if "file-name" in spec:
spec_errors, spec_warnings = sr.run_checks(
file_content.inpt.split("/")[-1]
)
Expand Down
66 changes: 51 additions & 15 deletions checksit/cvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,26 +25,57 @@ def _load(self, vocab_id):
vocab_file = os.path.join(vocabs_dir, f"{vocab_id}.json")
self._vocabs[vocab_id] = json.load(open(vocab_file))

def _load_from_url(self, vocab_id):
# Loads a specific vocabulary from a URL
vocab_id_url = vocab_id.replace("__URL__", "https://")
if (
vocab_id_url.startswith("https://raw.githubusercontent.com")
and "/__latest__/" in vocab_id_url
):
vocab_id_url_base = vocab_id_url.split("/__latest__")[0]
vocab_id_url_base = vocab_id_url_base.replace(
"raw.githubusercontent.com", "github.com"
)
def _load_from_url_github(self, vocab_id_url: str):
    """Fetch a vocabulary JSON document hosted on raw.githubusercontent.com.

    If the URL contains a ``/__latest__/`` path segment, that placeholder is
    replaced by the last path segment of the URL that
    ``<repository>/releases/latest`` resolves to on github.com
    (presumably the tag of the latest release - confirm against GitHub's
    redirect behaviour).

    Args:
        vocab_id_url: Full ``https://`` URL of the vocabulary file.

    Returns:
        The decoded JSON body, or an empty list when the request does not
        return HTTP 200 (a warning is printed in that case).
    """
    vocab_list = []
    if "/__latest__/" in vocab_id_url:
        # The release lookup happens on github.com, not on the raw-content host.
        repo_url = vocab_id_url.split("/__latest__")[0].replace(
            "raw.githubusercontent.com", "github.com"
        )
        latest_version = requests.get(
            f"{repo_url}/releases/latest"
        ).url.split("/")[-1]
        vocab_id_url = vocab_id_url.replace("__latest__", latest_version)

    res = requests.get(vocab_id_url.replace("__URL__", "https://"))
    if res.status_code != 200:
        print(f"[WARNING] Failed to load vocab: {vocab_id_url}")
        return vocab_list

    # This helper only returns the vocabulary; storing it in the cache is left
    # to the caller, which knows the vocab id (this function does not).
    return res.json()

def _load_from_url_esacci(self, vocab_id_url: str):
    """Fetch an ESA CCI vocabulary and return its labels as a sorted list.

    The JSON body is iterated as a sequence of mappings. For URLs containing
    ``dataType`` the first ``@value`` of every key ending in ``#altLabel`` is
    collected; for URLs containing ``product`` the same is done for keys
    ending in ``#prefLabel``.

    Args:
        vocab_id_url: Full URL of the vocabulary JSON document.

    Returns:
        Sorted list of labels. The list is empty when the request does not
        return HTTP 200 or when the URL names neither vocabulary (a warning
        is printed in both cases).
    """
    response = requests.get(vocab_id_url)
    if response.status_code != 200:
        print(f"[WARNING] Failed to load vocab: {vocab_id_url}")
        return []
    records = response.json()

    # Choose which label key to harvest from the URL itself.
    if 'dataType' in vocab_id_url:
        wanted_suffix = "#altLabel"
    elif 'product' in vocab_id_url:
        wanted_suffix = "#prefLabel"
    else:
        print(f"[WARNING] ESA CCI vocab url not recognised: {vocab_id_url}")
        return []

    labels = []
    for record in records:
        for name, entries in record.items():
            if name.endswith(wanted_suffix):
                labels.append(entries[0]["@value"])
    labels.sort()
    return labels

def _load_from_url(self, vocab_id: str):
    """Load a vocabulary from a URL and cache it under ``vocab_id``.

    The ``__URL__`` prefix of ``vocab_id`` is replaced by ``https://`` and the
    resulting URL is dispatched on its host: raw.githubusercontent.com goes to
    ``_load_from_url_github`` and vocab.ceda.ac.uk to
    ``_load_from_url_esacci``. Any other URL prints a warning and is cached
    as an empty vocabulary.

    Args:
        vocab_id: Vocabulary identifier of the form ``__URL__<host>/<path>``.
    """
    vocab_id_url = vocab_id.replace("__URL__", "https://")

    # Start from an empty vocabulary so that an unrecognised host is cached as
    # "no options" (matching what the loaders return on failure) instead of
    # raising UnboundLocalError after the warning has been printed.
    vocab_list = []
    if vocab_id_url.startswith("https://raw.githubusercontent.com"):
        vocab_list = self._load_from_url_github(vocab_id_url)
    elif vocab_id_url.startswith("https://vocab.ceda.ac.uk"):
        vocab_list = self._load_from_url_esacci(vocab_id_url)
    else:
        print(f"[WARNING] Failed to load vocab: {vocab_id}")
        print(f"Vocabulary url provided is not recognised: {vocab_id_url}")

    self._vocabs[vocab_id] = vocab_list

def __getitem__(self, vocab_id):
# Enables dictionary access to individual vocabulary items
Expand Down Expand Up @@ -85,16 +116,21 @@ def lookup(self, vocab_lookup):

return obj

def check(self, vocab_lookup, value, label="", lookup=True):
def check(self, vocab_lookup, value, label="", lookup=True, spec_verb=False):
# Return a list of errors - empty list if no errors
errors = []
options = [self.lookup(vocab_lookup) if lookup else vocab_lookup][0]
if spec_verb:
print(f"Vocab lookup: {vocab_lookup}")

if isinstance(options, list):
if value not in options:
errors.append(
f"{label} '{value}' not in vocab options: {options} (using: '{vocab_lookup}')"
)
else:
if spec_verb:
print(f"Value: {value} is in list {options}")
elif isinstance(options, dict):
for key in options.keys():
if key in value.keys():
Expand Down
130 changes: 129 additions & 1 deletion checksit/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
# Matches a date string of 4, 6 or 8 digits, or 8 digits followed by a hyphen
# and a further 2, 4 or 6 digits.
DATE_REGEX = re.compile(
r"^\d{4}$|^\d{6}$|^\d{8}$|^\d{8}-\d{2}$|^\d{8}-\d{4}$|^\d{8}-\d{6}$"
)

# Matches an undelimited date/time string of exactly 4, 6, 8, 10, 12 or 14
# digits; used by check_generic_file_name before trying strptime formats.
DATE_REGEX_GENERIC = re.compile(
r"^\d{4}$|^\d{6}$|^\d{8}$|^\d{10}$|^\d{12}$|^\d{14}$"
)

def _get_bounds_var_ids(dct):
return [
Expand Down Expand Up @@ -552,6 +554,132 @@ def check_file_name(file_name, vocab_checks=None, rule_checks=None, **kwargs):

return errors, warnings

def check_generic_file_name(file_name, vocab_checks=None, segregator=None, extension=None, spec_verbose=False, **kwargs):
    """Validate each field of a file name against a user-defined specification.

    The extension is removed from the end of ``file_name``, the remainder is
    split on the segregator, and part ``i`` is checked against the rule stored
    under ``vocab_checks["fieldNN"]`` (``NN`` is ``i`` zero-padded to two
    digits). The rule type is selected by the rule's prefix:

    * ``__vocabs__...`` or ``__URL__...``: the part is passed to
      ``vocabs.check`` together with the rule.
    * ``__date__:<fmt>[,<fmt>...]``: the part must match
      ``DATE_REGEX_GENERIC`` and parse with at least one ``strptime`` format.
    * ``__version__:<regex>``: the part must match the regular expression.

    Args:
        file_name: File name to check; must be a ``str``.
        vocab_checks: Mapping of ``fieldNN`` keys to rule strings.
        segregator: Mapping holding the field separator under ``"seg"``;
            ``"_"`` is used when it cannot be read.
        extension: Mapping holding the file extension under ``"ext"``;
            ``".test"`` is used when it cannot be read.
        spec_verbose: Mapping holding a flag under ``"spec_verb"``; when true,
            progress and errors are printed as well as returned.
        **kwargs: Accepted and ignored.

    Returns:
        Tuple ``(errors, warnings)`` of lists of strings. ``warnings`` is
        always empty. If the number of file-name parts differs from the number
        of entries in ``vocab_checks`` a single error is returned and no field
        is checked.

    Raises:
        ValueError: If ``file_name`` is not a string.
        KeyError: If ``vocab_checks`` has the expected number of entries but
            no ``fieldNN`` key for some position (keys must be consecutive).
        IndexError: If a ``__date__`` or ``__version__`` rule has no ``:``.
    """

    def _option(container, key, default):
        # Spec parameters arrive as single-key mappings (e.g. {"seg": "-"});
        # None, a bare value or a missing key all select the default.
        try:
            return container[key]
        except (TypeError, KeyError, IndexError):
            return default

    def _parses_with(text, fmt):
        # True when ``text`` is a valid date/time under the strptime format.
        try:
            dt.datetime.strptime(text, fmt)
        except ValueError:
            return False
        return True

    vocab_checks = vocab_checks or {}
    seg = _option(segregator, "seg", "_")
    ext = _option(extension, "ext", ".test")
    spec_verb = _option(spec_verbose, "spec_verb", False)

    errors = []
    warnings = []

    def _fail(message):
        # Record an error and echo it when verbose output is requested.
        errors.append(message)
        if spec_verb:
            print(message)

    if not isinstance(file_name, str):
        raise ValueError(
            f"file_name must be a string, got {type(file_name).__name__}"
        )

    # Strip the extension only where it is a suffix: removing every occurrence
    # would silently alter fields that happen to contain the same text.
    if ext and file_name.endswith(ext):
        extracted_name = file_name[: -len(ext)]
    else:
        extracted_name = file_name
    file_name_parts = extracted_name.split(seg)

    if spec_verb:
        print(f"File name: {file_name}")
        print(f"Segregator: {seg}")
        print(f"Extension: {ext}")
        print(f"All file name parts: {file_name_parts}")

    # The number of parts must equal the number of fields in the spec. This
    # does not depend on the individual part, so it is decided once up front.
    n_parts = len(file_name_parts)
    n_fields = len(vocab_checks)
    if n_fields < n_parts:
        _fail(
            f"[file name]: Number of file name fields ({n_parts}) is greater than the {n_fields} fields expected."
        )
        return errors, warnings
    if n_fields > n_parts:
        _fail(
            f"[file name]: Number of file name fields ({n_parts}) is less than the {n_fields} fields expected."
        )
        return errors, warnings

    for idx, key in enumerate(file_name_parts):
        if spec_verb:
            print('')
            print(idx, key)
        field = vocab_checks[f"field{idx:02}"]

        if field.startswith('__vocabs__') or field.startswith('__URL__'):
            # VOCAB (config or URL)
            if vocabs.check(field, key, spec_verb=spec_verb):
                _fail(f"[file name]: Unknown field '{key}' in vocab {field}.")

        elif field.startswith('__date__'):
            # DATE REGEX
            # Split on the first colon only so formats may contain colons.
            fmts = field.split(":", 1)[1].split(",")
            if spec_verb:
                print(f"Valid date formats: {fmts}")

            if not DATE_REGEX_GENERIC.match(key):
                _fail(
                    f"[file name]: Expecting date/time - bad date format '{key}'"
                )
            elif any(_parses_with(key, fmt) for fmt in fmts):
                if spec_verb:
                    print(f"Date string {key} matches the required format")
            else:
                _fail(
                    f"[file name]: Invalid date/time string '{key}'. Date/time should take the form YYYY[MM[DD[HH[MM[SS]]]]], where the fields in brackets are optional."
                )

        elif field.startswith('__version__'):
            # FILE/PRODUCT VERSION
            # Split on the first colon only so the regex may contain colons.
            verfmt = field.split(":", 1)[1]
            if re.match(verfmt, key):
                if spec_verb:
                    print(f"File version {key} matches the required format")
            else:
                _fail(
                    f"[file name]: Invalid file version '{key}'. File versions should take the form n{{1,}}[.n{{1,}}]."
                )

        else:
            # FIELD NOT RECOGNISED
            _fail(f"[file name]: {field} field type not recognised.")

    return errors, warnings

def check_radar_moment_variables(
dct, exist_attrs=None, rule_attrs=None, one_of_attrs=None, skip_spellcheck=False
Expand Down
5 changes: 5 additions & 0 deletions checksit/vocabs/esa-cci-file-name-config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"field00": ["ESACCI"],
"field01": ["AEROSOL","AIS","BIOMASS","CLOUD","FIRE","GHG","GHRSST","GIS","GLACIERS","HRLC","ICESHEETS","LAKES","LC","LST","OC","OZONE","PERMAFROST","RD","SEAICE","SEALEVEL","SEASTATE","SEASURFACESALINITY","SNOW","SOILMOISTURE","SST","VEGETATION","WATERVAPOUR"],
"field02": ["L0","L1A","L1B","L1C","L2","L2P","L3","L3U","L3C","L3S","L4","IND"]
}
5 changes: 5 additions & 0 deletions checksit/vocabs/esa-cci-global-attrs-config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"Conventions": ["CF-1.5","CF-1.6","CF-1.7","CF-1.8","CF-1.9"],
"project": "Climate Change Initiative - European Space Agency",
"license": "ESA CCI Data Policy: free and open access"
}
28 changes: 28 additions & 0 deletions specs/groups/esa-cci-v1.0/esa-cci-file-name-add-seg.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
file-name-format:
func: checksit.generic.check_generic_file_name
params:
vocab_checks:
# ESACCI
field00: __vocabs__:esa-cci-file-name-config:field00
# CCI Project (e.g. SEAICE)
field01: __vocabs__:esa-cci-file-name-config:field01
# Processing Level (e.g. L3C)
field02: __vocabs__:esa-cci-file-name-config:field02
# Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary
# https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
field03: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
# Product String (e.g. NIMBUS5_ESMR-EASE2_NH)
# http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
# Additional segregator (also stored in the 'product' vocabulary)
field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
# Date and time
field06: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S
# File version
field07: __version__:^fv\d?\d.?\d?\d?$
segregator:
seg: '-'
extension:
ext: '.nc'
spec_verbose:
spec_verb: True
30 changes: 30 additions & 0 deletions specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg-gds.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
file-name-format:
func: checksit.generic.check_generic_file_name
params:
vocab_checks:
# Date and time
field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S
# ESACCI
field01: __vocabs__:esa-cci-file-name-config:field00
# Processing Level (e.g. L3C)
field02: __vocabs__:esa-cci-file-name-config:field02
# CCI Project (e.g. SEAICE)
field03: __vocabs__:esa-cci-file-name-config:field01
# Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary
# https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
# Product String (e.g. NIMBUS5_ESMR-EASE2_NH)
# http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
# Additional segregator (also stored in the 'product' vocabulary)
field06: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
# GDS version
field07: __version__:^v\d?\d.?\d?\d?$
# File version
field08: __version__:^fv\d?\d.?\d?\d?$
segregator:
seg: '-'
extension:
ext: '.nc'
spec_verbose:
spec_verb: True
28 changes: 28 additions & 0 deletions specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
file-name-format:
func: checksit.generic.check_generic_file_name
params:
vocab_checks:
# Date and time
field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S
# ESACCI
field01: __vocabs__:esa-cci-file-name-config:field00
# Processing Level (e.g. L3C)
field02: __vocabs__:esa-cci-file-name-config:field02
# CCI Project (e.g. SEAICE)
field03: __vocabs__:esa-cci-file-name-config:field01
# Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary
# https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
# Product String (e.g. NIMBUS5_ESMR-EASE2_NH)
# http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
# Additional segregator (also stored in the 'product' vocabulary)
field06: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
# File version
field07: __version__:^fv\d?\d.?\d?\d?$
segregator:
seg: '-'
extension:
ext: '.nc'
spec_verbose:
spec_verb: True
28 changes: 28 additions & 0 deletions specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-gds.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# File-name specification for GHRSST-style ESA CCI names that carry a GDS
# version but no additional segregator field.
file-name-format:
  func: checksit.generic.check_generic_file_name
  params:
    vocab_checks:
      # Field keys must be numbered consecutively from field00: the checker
      # looks up "field" + the zero-padded position of each file-name part,
      # so a gap in the numbering raises a KeyError.
      # Date and time
      field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S
      # ESACCI
      field01: __vocabs__:esa-cci-file-name-config:field00
      # Processing Level (e.g. L3C)
      field02: __vocabs__:esa-cci-file-name-config:field02
      # CCI Project (e.g. SEAICE)
      field03: __vocabs__:esa-cci-file-name-config:field01
      # Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary
      # https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
      field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
      # Product String (e.g. NIMBUS5_ESMR-EASE2_NH)
      # http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
      field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
      # GDS version
      field06: __version__:^v\d?\d.?\d?\d?$
      # File version
      field07: __version__:^fv\d?\d.?\d?\d?$
    segregator:
      seg: '-'
    extension:
      ext: '.nc'
    spec_verbose:
      spec_verb: True
Loading

0 comments on commit 3ee22df

Please sign in to comment.