Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ESA CCI file checking #53

Merged
merged 11 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion checksit/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def _check_file(

for spec in specs:
sr = SpecificationChecker(spec)
if "amof-file-name" in spec:
if "file-name" in spec:
spec_errors, spec_warnings = sr.run_checks(
file_content.inpt.split("/")[-1]
)
Expand Down
66 changes: 51 additions & 15 deletions checksit/cvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,26 +25,57 @@ def _load(self, vocab_id):
vocab_file = os.path.join(vocabs_dir, f"{vocab_id}.json")
self._vocabs[vocab_id] = json.load(open(vocab_file))

def _load_from_url(self, vocab_id):
# Loads a specific vocabulary from a URL
vocab_id_url = vocab_id.replace("__URL__", "https://")
if (
vocab_id_url.startswith("https://raw.githubusercontent.com")
and "/__latest__/" in vocab_id_url
):
vocab_id_url_base = vocab_id_url.split("/__latest__")[0]
vocab_id_url_base = vocab_id_url_base.replace(
"raw.githubusercontent.com", "github.com"
)
def _load_from_url_github(self, vocab_id_url: str):
    """Fetch a vocabulary JSON document hosted on raw.githubusercontent.com.

    When the URL contains a ``/__latest__/`` path element, the placeholder
    is replaced by the last path element of the URL that
    ``<repository>/releases/latest`` on github.com resolves to (presumably
    the tag of the latest release - confirm against GitHub's redirect
    behaviour).

    Parameters:
        vocab_id_url: URL of the vocabulary; a leading ``__URL__``
            placeholder is accepted and expanded to ``https://``.

    Returns:
        The decoded JSON body, or an empty list if any request fails or
        the final response status is not 200.
    """
    # Review note: the ``__URL__`` -> ``https://`` replacement used to be
    # applied only to the final request, duplicating what _load_from_url
    # already does. It is applied once here instead, so the
    # ``/releases/latest`` lookup also sees a real URL.
    vocab_id_url = vocab_id_url.replace("__URL__", "https://")

    try:
        if "/__latest__/" in vocab_id_url:
            # Map the raw-content URL back to the repository page.
            repo_url = vocab_id_url.split("/__latest__")[0].replace(
                "raw.githubusercontent.com", "github.com"
            )
            latest_version = requests.get(
                f"{repo_url}/releases/latest", timeout=30
            ).url.split("/")[-1]
            vocab_id_url = vocab_id_url.replace("__latest__", latest_version)
        res = requests.get(vocab_id_url, timeout=30)
    except requests.RequestException as exc:
        # Connection problems and timeouts are reported like any other
        # failed load rather than aborting the whole check run.
        print(f"[WARNING] Failed to load vocab: {vocab_id_url} ({exc})")
        return []

    if res.status_code != 200:
        print(f"[WARNING] Failed to load vocab: {vocab_id_url}")
        return []

    return res.json()

def _load_from_url_esacci(self, vocab_id_url: str):
    """Fetch an ESA CCI vocabulary and return its labels as a sorted list.

    The kind of label extracted depends on the URL:

    * URLs containing ``dataType`` yield the first ``@value`` of every key
      ending in ``#altLabel``;
    * URLs containing ``product`` yield the first ``@value`` of every key
      ending in ``#prefLabel``.

    Parameters:
        vocab_id_url: Full ``https://`` URL of the vocabulary JSON.

    Returns:
        A sorted list of label strings, or an empty list if the request
        fails, the status is not 200, or the URL type is not recognised.
    """
    try:
        # A timeout stops an unresponsive server from hanging the checker.
        res = requests.get(vocab_id_url, timeout=30)
    except requests.RequestException as exc:
        print(f"[WARNING] Failed to load vocab: {vocab_id_url} ({exc})")
        return []

    if res.status_code != 200:
        print(f"[WARNING] Failed to load vocab: {vocab_id_url}")
        return []

    # Decide which label to harvest before decoding the body, so an
    # unrecognised URL never depends on the payload being valid JSON.
    if "dataType" in vocab_id_url:
        label_suffix = "#altLabel"
    elif "product" in vocab_id_url:
        label_suffix = "#prefLabel"
    else:
        print(f"[WARNING] ESA CCI vocab url not recognised: {vocab_id_url}")
        return []

    return sorted(
        labels[0]["@value"]
        for entry in res.json()
        for key, labels in entry.items()
        if key.endswith(label_suffix)
    )

def _load_from_url(self, vocab_id: str):
    """Load a vocabulary from a URL and cache it under ``vocab_id``.

    The ``__URL__`` placeholder in ``vocab_id`` is expanded to
    ``https://`` and the request is dispatched on the host:
    raw.githubusercontent.com goes to ``_load_from_url_github`` and
    vocab.ceda.ac.uk goes to ``_load_from_url_esacci``.

    Parameters:
        vocab_id: Vocabulary identifier, used unchanged as the key in
            ``self._vocabs``.

    An unrecognised host prints a warning and caches an empty list.
    """
    vocab_id_url = vocab_id.replace("__URL__", "https://")

    # Start from an empty vocabulary: previously the unrecognised-host
    # branch left ``vocab_list`` unassigned, so the final assignment
    # raised UnboundLocalError instead of recording the failure.
    vocab_list = []

    if vocab_id_url.startswith("https://raw.githubusercontent.com"):
        vocab_list = self._load_from_url_github(vocab_id_url)
    elif vocab_id_url.startswith("https://vocab.ceda.ac.uk"):
        vocab_list = self._load_from_url_esacci(vocab_id_url)
    else:
        print(f"[WARNING] Failed to load vocab: {vocab_id}")
        print(f"Vocabulary url provided is not recognised: {vocab_id_url}")

    self._vocabs[vocab_id] = vocab_list

def __getitem__(self, vocab_id):
# Enables dictionary access to individual vocabulary items
Expand Down Expand Up @@ -85,16 +116,21 @@ def lookup(self, vocab_lookup):

return obj

def check(self, vocab_lookup, value, label="", lookup=True):
def check(self, vocab_lookup, value, label="", lookup=True, spec_verb=False):
# Return a list of errors - empty list if no errors
errors = []
options = [self.lookup(vocab_lookup) if lookup else vocab_lookup][0]
if spec_verb:
print(f"Vocab lookup: {vocab_lookup}")

if isinstance(options, list):
if value not in options:
errors.append(
f"{label} '{value}' not in vocab options: {options} (using: '{vocab_lookup}')"
)
else:
if spec_verb:
print(f"Value: {value} is in list {options}")
elif isinstance(options, dict):
for key in options.keys():
if key in value.keys():
Expand Down
130 changes: 129 additions & 1 deletion checksit/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
# Accepts 4, 6 or 8 digits; an 8-digit value may be followed by "-" and a
# further 2, 4 or 6 digits.
DATE_REGEX = re.compile(r"^(?:\d{4}|\d{6}|\d{8}(?:-(?:\d{2}){1,3})?)$")

# Accepts an undelimited run of 4, 6, 8, 10, 12 or 14 digits, i.e. four
# digits followed by up to five further two-digit groups.
DATE_REGEX_GENERIC = re.compile(r"^\d{4}(?:\d{2}){0,5}$")

def _get_bounds_var_ids(dct):
return [
Expand Down Expand Up @@ -552,6 +554,132 @@ def check_file_name(file_name, vocab_checks=None, rule_checks=None, **kwargs):

return errors, warnings

def _spec_option(container, key, default):
    """Return ``container[key]`` if ``container`` is a dict holding ``key``, else ``default``."""
    if isinstance(container, dict):
        return container.get(key, default)
    return default


def check_generic_file_name(file_name, vocab_checks=None, segregator=None, extension=None, spec_verbose=False, **kwargs):
    """Check each field of a file name against a user-defined specification.

    The extension is stripped from the end of ``file_name`` and the rest is
    split on the segregator. The part at position ``i`` is validated against
    ``vocab_checks["field<ii>"]`` (two-digit, zero-padded index), whose value
    selects the kind of check:

    * ``__vocabs__...`` / ``__URL__...`` - the part must pass ``vocabs.check``;
    * ``__date__:<fmt>[,<fmt>...]`` - the part must be an undelimited digit
      string accepted by ``DATE_REGEX_GENERIC`` and parse with at least one
      of the comma-separated ``strptime`` formats;
    * ``__version__:<regex>`` - the part must match the regex (``re.match``).

    Parameters:
        file_name: Name of the file to check (no directory components are
            removed here).
        vocab_checks: Dict mapping ``field00``, ``field01``, ... to field
            specifications. Keys must be numbered consecutively.
        segregator: Dict with key ``seg`` giving the field separator
            (default ``"_"``).
        extension: Dict with key ``ext`` giving the file extension
            (default ``".test"``).
        spec_verbose: Dict with key ``spec_verb``; a true value prints
            diagnostics while checking (default off).
        **kwargs: Accepted and ignored.

    Returns:
        ``(errors, warnings)`` - two lists of strings; ``warnings`` is
        always empty.

    Raises:
        ValueError: if ``file_name`` is not a string.
        KeyError: if ``vocab_checks`` has the right number of entries but
            lacks the key for one of the positions.
    """
    vocab_checks = vocab_checks or {}
    # dict.get with a default replaces the former bare try/except blocks,
    # which silently swallowed every exception type.
    seg = _spec_option(segregator, "seg", "_")
    ext = _spec_option(extension, "ext", ".test")
    spec_verb = _spec_option(spec_verbose, "spec_verb", False)

    errors = []
    warnings = []

    def report(message):
        # Record an error and echo it when verbose output is enabled.
        errors.append(message)
        if spec_verb:
            print(message)

    if not isinstance(file_name, str):
        raise ValueError(
            f"file_name must be a string, got {type(file_name).__name__}"
        )

    # Strip the extension only where it is a suffix; str.replace would
    # also delete the same text occurring inside a field.
    if ext and file_name.endswith(ext):
        extracted_name = file_name[: -len(ext)]
    else:
        extracted_name = file_name
    file_name_parts = extracted_name.split(seg)

    if spec_verb:
        print(f"File name: {file_name}")
        print(f"Segregator: {seg}")
        print(f"Extension: {ext}")
        print(f"All file name parts: {file_name_parts}")

    # The number of parts must equal the number of specified fields. This
    # does not depend on the individual part, so it is checked once here.
    n_parts = len(file_name_parts)
    n_fields = len(vocab_checks)
    if n_parts > n_fields:
        report(
            f"[file name]: Number of file name fields ({n_parts}) is greater than the {n_fields} fields expected."
        )
        return errors, warnings
    if n_parts < n_fields:
        report(
            f"[file name]: Number of file name fields ({n_parts}) is less than the {n_fields} fields expected."
        )
        return errors, warnings

    for idx, key in enumerate(file_name_parts):
        if spec_verb:
            print('')
            print(idx, key)

        field_name = f"field{idx:02}"
        try:
            field = vocab_checks[field_name]
        except KeyError:
            raise KeyError(
                f"vocab_checks has no '{field_name}' entry; field keys must be numbered consecutively from field00"
            ) from None

        # Each branch ends in ``continue`` so the checks stay flat instead
        # of nesting inside one another.
        if field.startswith(('__vocabs__', '__URL__')):
            # VOCAB (config or URL)
            if vocabs.check(field, key, spec_verb=spec_verb) != []:
                report(f"[file name]: Unknown field '{key}' in vocab {field}.")
            continue

        if field.startswith('__date__'):
            # DATE REGEX
            # maxsplit=1 keeps any ':' inside the format list intact.
            fmts = field.split(":", 1)[1].split(",")
            if spec_verb:
                print(f"Valid date formats: {fmts}")

            if not DATE_REGEX_GENERIC.match(key):
                report(
                    f"[file name]: Expecting date/time - bad date format '{key}'"
                )
                continue

            for fmt in fmts:
                try:
                    dt.datetime.strptime(key, fmt)
                except ValueError:
                    continue
                if spec_verb:
                    print(f"Date string {key} matches the required format")
                break
            else:
                # No format parsed the value.
                report(
                    f"[file name]: Invalid date/time string '{key}'. Date/time should take the form YYYY[MM[DD[HH[MM[SS]]]]], where the fields in brackets are optional."
                )
            continue

        if field.startswith('__version__'):
            # FILE/PRODUCT VERSION
            # maxsplit=1 so a regex containing ':' (e.g. "(?:...)") is not
            # truncated at its first colon.
            verfmt = field.split(":", 1)[1]
            if re.match(verfmt, key):
                if spec_verb:
                    print(f"File version {key} matches the required format")
            else:
                report(
                    f"[file name]: Invalid file version '{key}'. File versions should take the form n{{1,}}[.n{{1,}}]."
                )
            continue

        # FIELD NOT RECOGNISED
        report(f"[file name]: {field} field type not recognised.")

    return errors, warnings

def check_radar_moment_variables(
dct, exist_attrs=None, rule_attrs=None, one_of_attrs=None, skip_spellcheck=False
Expand Down
5 changes: 5 additions & 0 deletions checksit/vocabs/esa-cci-file-name-config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"field00": ["ESACCI"],
"field01": ["AEROSOL","AIS","BIOMASS","CLOUD","FIRE","GHG","GHRSST","GIS","GLACIERS","HRLC","ICESHEETS","LAKES","LC","LST","OC","OZONE","PERMAFROST","RD","SEAICE","SEALEVEL","SEASTATE","SEASURFACESALINITY","SNOW","SOILMOISTURE","SST","VEGETATION","WATERVAPOUR"],
"field02": ["L0","L1A","L1B","L1C","L2","L2P","L3","L3U","L3C","L3S","L4","IND"]
}
5 changes: 5 additions & 0 deletions checksit/vocabs/esa-cci-global-attrs-config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"Conventions": ["CF-1.5","CF-1.6","CF-1.7","CF-1.8","CF-1.9"],
"project": "Climate Change Initiative - European Space Agency",
"license": "ESA CCI Data Policy: free and open access"
}
28 changes: 28 additions & 0 deletions specs/groups/esa-cci-v1.0/esa-cci-file-name-add-seg.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# ESA CCI file-name specification (variant with an additional segregator field).
# checksit.generic.check_generic_file_name strips the extension, splits the
# name on the segregator and checks part N against fieldNN below.
file-name-format:
func: checksit.generic.check_generic_file_name
params:
vocab_checks:
# ESACCI
field00: __vocabs__:esa-cci-file-name-config:field00
# CCI Project (e.g. SEAICE)
field01: __vocabs__:esa-cci-file-name-config:field01
# Processing Level (e.g. L3C)
field02: __vocabs__:esa-cci-file-name-config:field02
# Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary
# https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
# (the __URL__ prefix is expanded to https:// when the vocabulary is loaded)
field03: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
# Product String (e.g. NIMBUS5_ESMR-EASE2_NH)
# https://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
# Additional segregator (also stored in the 'product' vocabulary)
field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
# Date and time: must parse with one of these comma-separated strptime formats
field06: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S
# File version: must match this regular expression
# NOTE(review): the '.' is unescaped, so it matches any character, not only
# a literal dot - confirm whether '\.' was intended.
field07: __version__:^fv\d?\d.?\d?\d?$
# Character separating the file-name fields
segregator:
seg: '-'
# Extension removed from the file name before it is split
extension:
ext: '.nc'
# When true, the check prints diagnostics for every field
spec_verbose:
spec_verb: True
30 changes: 30 additions & 0 deletions specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg-gds.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# ESA CCI file-name specification (GHRSST field order, with an additional
# segregator field and a GDS version field).
# checksit.generic.check_generic_file_name strips the extension, splits the
# name on the segregator and checks part N against fieldNN below.
file-name-format:
func: checksit.generic.check_generic_file_name
params:
vocab_checks:
# Date and time: must parse with one of these comma-separated strptime formats
field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S
# ESACCI
field01: __vocabs__:esa-cci-file-name-config:field00
# Processing Level (e.g. L3C)
field02: __vocabs__:esa-cci-file-name-config:field02
# CCI Project (e.g. SEAICE)
field03: __vocabs__:esa-cci-file-name-config:field01
# Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary
# https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
# (the __URL__ prefix is expanded to https:// when the vocabulary is loaded)
field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
# Product String (e.g. NIMBUS5_ESMR-EASE2_NH)
# https://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
# Additional segregator (also stored in the 'product' vocabulary)
field06: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
# GDS version: must match this regular expression
# NOTE(review): the '.' is unescaped, so it matches any character, not only
# a literal dot - confirm whether '\.' was intended (also applies to field08).
field07: __version__:^v\d?\d.?\d?\d?$
# File version: must match this regular expression
field08: __version__:^fv\d?\d.?\d?\d?$
# Character separating the file-name fields
segregator:
seg: '-'
# Extension removed from the file name before it is split
extension:
ext: '.nc'
# When true, the check prints diagnostics for every field
spec_verbose:
spec_verb: True
28 changes: 28 additions & 0 deletions specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# ESA CCI file-name specification (GHRSST field order, with an additional
# segregator field).
# checksit.generic.check_generic_file_name strips the extension, splits the
# name on the segregator and checks part N against fieldNN below.
file-name-format:
func: checksit.generic.check_generic_file_name
params:
vocab_checks:
# Date and time: must parse with one of these comma-separated strptime formats
field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S
# ESACCI
field01: __vocabs__:esa-cci-file-name-config:field00
# Processing Level (e.g. L3C)
field02: __vocabs__:esa-cci-file-name-config:field02
# CCI Project (e.g. SEAICE)
field03: __vocabs__:esa-cci-file-name-config:field01
# Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary
# https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
# (the __URL__ prefix is expanded to https:// when the vocabulary is loaded)
field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
# Product String (e.g. NIMBUS5_ESMR-EASE2_NH)
# https://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
# Additional segregator (also stored in the 'product' vocabulary)
field06: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
# File version: must match this regular expression
# NOTE(review): the '.' is unescaped, so it matches any character, not only
# a literal dot - confirm whether '\.' was intended.
field07: __version__:^fv\d?\d.?\d?\d?$
# Character separating the file-name fields
segregator:
seg: '-'
# Extension removed from the file name before it is split
extension:
ext: '.nc'
# When true, the check prints diagnostics for every field
spec_verbose:
spec_verb: True
28 changes: 28 additions & 0 deletions specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-gds.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# ESA CCI file-name specification (GHRSST field order, with a GDS version
# field and no additional segregator field).
# checksit.generic.check_generic_file_name strips the extension, splits the
# name on the segregator and checks part N against fieldNN below, so the
# field keys must be numbered consecutively from field00.
file-name-format:
  func: checksit.generic.check_generic_file_name
  params:
    vocab_checks:
      # Date and time: must parse with one of these comma-separated strptime formats
      field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S
      # ESACCI
      field01: __vocabs__:esa-cci-file-name-config:field00
      # Processing Level (e.g. L3C)
      field02: __vocabs__:esa-cci-file-name-config:field02
      # CCI Project (e.g. SEAICE)
      field03: __vocabs__:esa-cci-file-name-config:field01
      # Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary
      # https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
      # (the __URL__ prefix is expanded to https:// when the vocabulary is loaded)
      field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
      # Product String (e.g. NIMBUS5_ESMR-EASE2_NH)
      # https://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
      field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
      # GDS version: must match this regular expression.
      # Renumbered from field07: the checker looks fields up by position, so
      # a gap at field06 made the seventh file-name part fail with a KeyError.
      # NOTE(review): the '.' is unescaped, so it matches any character, not
      # only a literal dot - confirm whether '\.' was intended (also applies
      # to field07).
      field06: __version__:^v\d?\d.?\d?\d?$
      # File version: must match this regular expression (renumbered from field08)
      field07: __version__:^fv\d?\d.?\d?\d?$
    # Character separating the file-name fields
    segregator:
      seg: '-'
    # Extension removed from the file name before it is split
    extension:
      ext: '.nc'
    # When true, the check prints diagnostics for every field
    spec_verbose:
      spec_verb: True
Loading
Loading