Skip to content

Commit

Permalink
Merge pull request #43 from cedadev/url_vocabs
Browse files Browse the repository at this point in the history
Changes to vocab checks
  • Loading branch information
joshua-hampton authored Mar 1, 2024
2 parents e618d83 + 40e017d commit 7c1c336
Show file tree
Hide file tree
Showing 7 changed files with 107 additions and 28 deletions.
20 changes: 19 additions & 1 deletion checksit/cvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
import json
from collections import deque
import requests


from .config import get_config
Expand All @@ -23,10 +24,27 @@ def _load(self, vocab_id):
vocab_file = os.path.join(vocabs_dir, f"{vocab_id}.json")
self._vocabs[vocab_id] = json.load(open(vocab_file))

def _load_from_url(self, vocab_id, timeout=30):
    """
    Loads a specific vocabulary from a URL and stores it in `self._vocabs`.

    The `__URL__` marker in `vocab_id` is replaced by `https://` to build the
    address. For `raw.githubusercontent.com` addresses containing a
    `/__latest__/` path segment, the segment is substituted with the last path
    component of the URL that the matching `github.com` repository's
    `/releases/latest` page resolves to.

    :param vocab_id: vocabulary identifier containing the `__URL__` marker;
        also used unchanged as the key in `self._vocabs`.
    :param timeout: seconds to wait on each HTTP request before `requests`
        raises a timeout error, so a stalled connection cannot hang the run.

    On a non-200 response a warning is printed and nothing is stored.
    """
    vocab_id_url = vocab_id.replace("__URL__", "https://")
    if vocab_id_url.startswith("https://raw.githubusercontent.com") and "/__latest__/" in vocab_id_url:
        # Everything before "/__latest__" is the repository path; switch the host
        # to github.com to reach its releases page.
        repo_url = vocab_id_url.split("/__latest__")[0].replace("raw.githubusercontent.com", "github.com")
        # Take the last path segment of the URL the request ended up at
        # (presumably the release tag after GitHub's redirect - confirm).
        latest_version = requests.get(f"{repo_url}/releases/latest", timeout=timeout).url.split("/")[-1]
        vocab_id_url = vocab_id_url.replace("__latest__", latest_version)
    # The "__URL__" marker was already substituted above, so fetch the URL as is.
    res = requests.get(vocab_id_url, timeout=timeout)
    if res.status_code == 200:
        self._vocabs[vocab_id] = res.json()
    else:
        print(f"[WARNING] Failed to load vocab: {vocab_id}")

def __getitem__(self, vocab_id):
    """
    Enables dictionary access to individual vocabulary items.

    A vocabulary not yet present in `self._vocabs` is loaded on first access:
    ids starting with `__URL__` go through `_load_from_url`, all others
    through `_load`. Exactly one loader is called per miss, so a URL id is
    never handed to the local-file loader.

    :param vocab_id: identifier of the vocabulary to return.
    :return: the entry stored in `self._vocabs` for `vocab_id`.

    NOTE(review): `_load_from_url` only prints a warning when the download
    fails, so the final lookup will then fail for the missing key
    (presumably a KeyError - confirm the type of `self._vocabs`).
    """
    if vocab_id not in self._vocabs:
        if vocab_id.startswith("__URL__"):
            self._load_from_url(vocab_id)
        else:
            self._load(vocab_id)

    return self._vocabs[vocab_id]

Expand Down
15 changes: 10 additions & 5 deletions checksit/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,13 +267,14 @@ def check_var(dct, variable, defined_attrs, attr_rules=[], skip_spellcheck=False
return errors, warnings


def check_file_name(file_name, vocab_checks=None, **kwargs):
def check_file_name(file_name, vocab_checks=None, rule_checks=None, **kwargs):
"""
Checks format of file name
Works for NCAS-GENERAL, would work for NCAS-RADAR if radar scan type is added as data product
"""
vocab_checks = vocab_checks or {}
rule_checks = rule_checks or {}
errors = []
warnings = []
file_name_parts = file_name.split("_")
Expand All @@ -287,11 +288,15 @@ def check_file_name(file_name, vocab_checks=None, **kwargs):
raise KeyError(msg)

# check platform
if "platform" in vocab_checks.keys():
if vocabs.check(vocab_checks["platform"], file_name_parts[1], label="_") != []:
errors.append(f"[file name]: Invalid file name format - unknown platform {file_name_parts[1]}")
if "platform" in rule_checks.keys():
if rules.check(rule_checks["platform"], file_name_parts[1], label="[file name]: Invalid file name format -") != ([], []):
rule_errors, rule_warnings = rules.check(rule_checks["platform"], file_name_parts[1], label="[file name]: Invalid file name format -")
if rule_errors != []:
errors.extend(rule_errors)
if rule_warnings != []:
warnings.extend(rule_warnings)
else:
msg = "No platform vocab defined in specs"
msg = "No platform rule defined in specs"
raise KeyError(msg)

# check date format
Expand Down
34 changes: 34 additions & 0 deletions checksit/rules/rule_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,40 @@ def longitude(value, context, extras=None, label=""):
return errors


def ceda_platform(value, context, extras=None, label=""):
    """
    A function to check if the platform is in the CEDA catalogue API

    Queries the catalogue's identifiers endpoint for `value`. The value is
    accepted only when exactly one result is returned and that result's
    `relatedTo.short_code` is "plat".

    :param value: platform name to look up.
    :param context: not used by this rule.
    :param extras: not used by this rule.
    :param label: prefix for the error message.
    :return: list of error strings, empty when the platform is accepted.
    """
    errors = []
    # Passing the value through `params` lets requests URL-encode it, and the
    # timeout stops an unresponsive API from hanging the check indefinitely.
    api_result = requests.get(
        "http://api.catalogue.ceda.ac.uk/api/v2/identifiers.json/",
        params={"url": value},
        timeout=30,
    )
    # Decode the response body once and reuse it for both conditions.
    results = api_result.json()['results']
    legit_platform = (
        len(results) == 1
        and results[0]['relatedTo']['short_code'] == "plat"
    )

    if not legit_platform:
        errors.append(f"{label} '{value}' is not a valid platform in the CEDA catalogue")

    return errors


def ncas_platform(value, context, extras=None, label=""):
    """
    A function to check if the platform is in the NCAS platform list

    Downloads `AMF_CVs/AMF_platform.json` from the ncas-data-platform-vocabs
    repository, at the version named by the last path segment of the URL that
    the repository's `/releases/latest` page resolves to. It then checks
    `value` against the entries under the file's `platform` key.

    :param value: platform name to look up.
    :param context: not used by this rule.
    :param extras: not used by this rule.
    :param label: prefix for the error message.
    :return: list of error strings, empty when the platform is listed.
    """
    errors = []

    # Timeouts stop a stalled connection from hanging the check indefinitely.
    latest_version = requests.get(
        "https://github.com/ncasuk/ncas-data-platform-vocabs/releases/latest",
        timeout=30,
    ).url.split("/")[-1]

    result = requests.get(
        f"https://raw.githubusercontent.com/ncasuk/ncas-data-platform-vocabs/{latest_version}/AMF_CVs/AMF_platform.json",
        timeout=30,
    )
    # Membership is tested against the mapping's keys directly.
    ncas_platforms = result.json()['platform']

    if value not in ncas_platforms:
        errors.append(f"{label} '{value}' is not a valid NCAS platform")

    return errors


def check_qc_flags(value, context, extras=None, label=""):
"""
A function to check flag_values and flag_meanings
Expand Down
5 changes: 3 additions & 2 deletions specs/groups/ncas-amof-2.0.0/amof-file-name.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ file-name-format:
func: checksit.generic.check_file_name
params:
vocab_checks:
instrument: __vocabs__:AMF_CVs/2.0.0/AMF_ncas_instrument:ncas_instrument:__all__
platform: __vocabs__:AMF_CVs/2.0.0/AMF_platform:platform:__all__
instrument: __URL__raw.githubusercontent.com/ncasuk/ncas-data-instrument-vocabs/__latest__/AMF_CVs/AMF_ncas_instrument.json:ncas_instrument:__all__
data_product: __vocabs__:AMF_CVs/2.0.0/AMF_product:product
rule_checks:
platform: rule-func:ceda-platform, rule-func-warning:ncas-platform
4 changes: 2 additions & 2 deletions specs/groups/ncas-amof-2.0.0/amof-global-attrs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ required-global-attrs:
func: checksit.generic.check_global_attrs
params:
vocab_attrs:
source: __vocabs__:AMF_CVs/2.0.0/AMF_ncas_instrument:ncas_instrument:__all__:description
platform: __vocabs__:AMF_CVs/2.0.0/AMF_platform:platform:__all__
source: __URL__raw.githubusercontent.com/ncasuk/ncas-data-instrument-vocabs/__latest__/AMF_CVs/AMF_ncas_instrument.json:ncas_instrument:__all__:description
rules_attrs:
platform: rule-func:ceda-platform, rule-func-warning:ncas-platform
Conventions: regex:CF-1.6,\sNCAS-AMF-2.0.0
instrument_manufacturer: rule-func:string-of-length:2+
instrument_model: rule-func:string-of-length:3+
Expand Down
38 changes: 20 additions & 18 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def test_check_global_attrs():
assert errors == ["[global-attributes:**************:attr4]: Attribute 'attr4' does not exist. "]
assert warnings == []

# Test function handles undefined attributes with vocab checks correctly
# Test function handles undefined attributes with vocab checks correctly
vocab_attrs = {
"attr1": "__vocabs__:tests/test_platforms:test_platforms:__all__"
}
Expand Down Expand Up @@ -136,7 +136,7 @@ def test_check_global_attrs():
assert errors == ["[global-attributes:**************:attr4]: Attribute 'attr4' does not exist. "]
assert warnings == []

# Test function handles undefined attributes with regex checks correctly
# Test function handles undefined attributes with regex checks correctly
regex_attrs = {
"attr1": r"\d{4}-\d{2}-\d{2}"
}
Expand Down Expand Up @@ -168,7 +168,7 @@ def test_check_global_attrs():
assert errors == ["[global-attributes:**************:attr4]: Attribute 'attr4' does not exist. "]
assert warnings == []

# Test function handles undefined attributes with rules checks correctly
# Test function handles undefined attributes with rules checks correctly
rules_attrs = {
"attr1": "rule-func:string-of-length:5"
}
Expand Down Expand Up @@ -347,68 +347,70 @@ def test_check_file_name():
# Test that the function correctly identifies invalid instrument name
vocab_checks = {
"instrument": "__vocabs__:tests/test_instruments:test_instruments:__all__",
"platform": "__vocabs__:tests/test_platforms:test_platforms:__all__",
"data_product": "__vocabs__:tests/test_products:test_products"
}
rule_checks = {
"platform": "rule-func:match-one-of:plat1|plat2"
}
file_name = "inst3_plat1_20220101_prod1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - unknown instrument inst3"]
assert warnings == []

# Test that the function correctly identifies invalid platform name
file_name = "inst1_plat3_20220101_prod1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
assert errors == ["[file name]: Invalid file name format - unknown platform plat3"]
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - 'plat3' must be one of: '['plat1', 'plat2']'"]
assert warnings == []

# Test that the function correctly identifies invalid date format
file_name = "inst1_plat1_2022010_prod1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - bad date format 2022010"]
assert warnings == []

# Test that the function correctly identifies invalid date
file_name = "inst1_plat1_20221301_prod1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - invalid date in file name 20221301"]
assert warnings == []

# Test that the function correctly identifies invalid data product
file_name = "inst1_plat1_20220101_prod3_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - unknown data product prod3"]
assert warnings == []

# Test that the function correctly identifies invalid version number format
file_name = "inst1_plat1_20220101_prod1_v10.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - incorrect file version number v10"]
assert warnings == []

# Test that the function correctly identifies too many options in file name
file_name = "inst1_plat1_20220101_prod1_option1_option2_option3_option4_option5_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - too many options in file name"]
assert warnings == []

# Test that the function correctly handles multiple errors
file_name = "inst3_plat3_20220101_prod1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
assert errors == ["[file name]: Invalid file name format - unknown instrument inst3","[file name]: Invalid file name format - unknown platform plat3"]
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - unknown instrument inst3","[file name]: Invalid file name format - 'plat3' must be one of: '['plat1', 'plat2']'"]
assert warnings == []

# Test that the function correctly handles valid file names
file_name = "inst1_plat1_20220101_prod1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == []
assert warnings == []

file_name = "inst1_plat1_20220101_prod1_opt1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == []
assert warnings == []

file_name = "inst1_plat1_20220101_prod1_opt1_opt2_opt3_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == []
assert warnings == []
assert warnings == []
19 changes: 19 additions & 0 deletions tests/test_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,25 @@ def test_longitude():
assert crf.longitude('200.0000', {}, label='Test') == ["Test '200.0000' must be within -180 and +180 "]


def test_ceda_platform():
    """Check ceda_platform against known, non-NCAS and made-up platform names."""
    ncas_platform_names = [
        "bt-tower-t35", "cao", "cao-sparsholt", "cdao", "cdao-frongoch",
        "cvao", "faam", "iao", "wao",
    ]
    # Every NCAS platform is expected to be accepted
    for platform_name in ncas_platform_names:
        result = crf.ceda_platform(platform_name, {})
        assert result == []
    # A platform outside the NCAS list is expected to be accepted as well
    non_ncas_result = crf.ceda_platform("netheravon", {})
    assert non_ncas_result == []
    # A made-up platform is expected to be reported with the labelled message
    expected_errors = ["Test 'example' is not a valid platform in the CEDA catalogue"]
    assert crf.ceda_platform("example", {}, label='Test') == expected_errors


def test_ncas_platform():
    """Check ncas_platform against NCAS, non-NCAS and made-up platform names."""
    ncas_platform_names = [
        "bt-tower-t35", "cao", "cao-sparsholt", "cdao", "cdao-frongoch",
        "cvao", "faam", "iao", "wao",
    ]
    # Every NCAS platform is expected to be accepted
    for platform_name in ncas_platform_names:
        result = crf.ncas_platform(platform_name, {})
        assert result == []
    # A platform outside the NCAS list is expected to be rejected
    expected_non_ncas = ["Test 'netheravon' is not a valid NCAS platform"]
    assert crf.ncas_platform("netheravon", {}, label='Test') == expected_non_ncas
    # A made-up platform is expected to be rejected too
    expected_example = ["Test 'example' is not a valid NCAS platform"]
    assert crf.ncas_platform("example", {}, label='Test') == expected_example

# rules.py
def _test_type(_type, value):
return r.check(f"type-rule:{_type}", value)
Expand Down

0 comments on commit 7c1c336

Please sign in to comment.