Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes to vocab checks #43

Merged
merged 9 commits into from
Mar 1, 2024
20 changes: 19 additions & 1 deletion checksit/cvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
import json
from collections import deque
import requests


from .config import get_config
Expand All @@ -23,10 +24,27 @@ def _load(self, vocab_id):
vocab_file = os.path.join(vocabs_dir, f"{vocab_id}.json")
self._vocabs[vocab_id] = json.load(open(vocab_file))

def _load_from_url(self, vocab_id):
    """
    Load a vocabulary from a URL and cache it in ``self._vocabs``.

    ``vocab_id`` is the spec-file form of the URL, in which the scheme is
    written as the ``__URL__`` marker (replaced here with ``https://``).
    For ``raw.githubusercontent.com`` URLs containing a ``/__latest__/``
    path segment, that segment is replaced with the last path component of
    the URL that the repository's ``releases/latest`` page resolves to
    (presumably the tag of the latest release - confirm against GitHub's
    redirect behaviour).

    On an HTTP 200 response the decoded JSON body is stored under the
    original ``vocab_id`` key. On any other status a warning is printed and
    nothing is stored, so a later lookup of ``vocab_id`` will not find it.

    Raises whatever ``requests`` raises for connection problems, including
    ``requests.exceptions.Timeout`` if a server does not respond in time.
    """
    # Without an explicit timeout `requests` waits forever on a stalled
    # server, which would hang the whole check run.
    timeout_seconds = 30

    vocab_url = vocab_id.replace("__URL__", "https://")

    if vocab_url.startswith("https://raw.githubusercontent.com") and "/__latest__/" in vocab_url:
        # Switch from the raw-content host to the repository page so that
        # ".../releases/latest" can be resolved.
        repo_url = vocab_url.split("/__latest__")[0]
        repo_url = repo_url.replace("raw.githubusercontent.com", "github.com")
        latest_page = requests.get(f"{repo_url}/releases/latest", timeout=timeout_seconds)
        latest_version = latest_page.url.split("/")[-1]
        vocab_url = vocab_url.replace("__latest__", latest_version)

    # The "__URL__" marker has already been substituted above, so the URL
    # can be fetched as it stands.
    res = requests.get(vocab_url, timeout=timeout_seconds)
    if res.status_code == 200:
        self._vocabs[vocab_id] = res.json()
    else:
        print(f"[WARNING] Failed to load vocab: {vocab_id}")

def __getitem__(self, vocab_id):
# Enables dictionary access to individual vocabulary items
if vocab_id not in self._vocabs:
self._load(vocab_id)
if vocab_id.startswith("__URL__"):
self._load_from_url(vocab_id)
else:
self._load(vocab_id)

return self._vocabs[vocab_id]

Expand Down
15 changes: 10 additions & 5 deletions checksit/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,13 +267,14 @@ def check_var(dct, variable, defined_attrs, attr_rules=[], skip_spellcheck=False
return errors, warnings


def check_file_name(file_name, vocab_checks=None, **kwargs):
def check_file_name(file_name, vocab_checks=None, rule_checks=None, **kwargs):
"""
Checks format of file name

Works for NCAS-GENERAL, would work for NCAS-RADAR if radar scan type is added as data product
"""
vocab_checks = vocab_checks or {}
rule_checks = rule_checks or {}
errors = []
warnings = []
file_name_parts = file_name.split("_")
Expand All @@ -287,11 +288,15 @@ def check_file_name(file_name, vocab_checks=None, **kwargs):
raise KeyError(msg)

# check platform
if "platform" in vocab_checks.keys():
if vocabs.check(vocab_checks["platform"], file_name_parts[1], label="_") != []:
errors.append(f"[file name]: Invalid file name format - unknown platform {file_name_parts[1]}")
if "platform" in rule_checks.keys():
if rules.check(rule_checks["platform"], file_name_parts[1], label="[file name]: Invalid file name format -") != ([], []):
rule_errors, rule_warnings = rules.check(rule_checks["platform"], file_name_parts[1], label="[file name]: Invalid file name format -")
if rule_errors != []:
errors.extend(rule_errors)
if rule_warnings != []:
warnings.extend(rule_warnings)
else:
msg = "No platform vocab defined in specs"
msg = "No platform rule defined in specs"
raise KeyError(msg)

# check date format
Expand Down
34 changes: 34 additions & 0 deletions checksit/rules/rule_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,40 @@ def longitude(value, context, extras=None, label=""):
return errors


def ceda_platform(value, context, extras=None, label=""):
    """
    A function to check if the platform is in the CEDA catalogue API

    Queries the CEDA catalogue ``identifiers.json`` endpoint with ``value``
    as the ``url`` query parameter. The value is accepted only when exactly
    one result is returned and that result's ``relatedTo.short_code`` is
    ``"plat"``.

    ``context`` and ``extras`` are not used by this rule; ``label`` is
    prefixed to the error message.

    Returns a list that is empty when the platform is valid, or contains a
    single error message otherwise. Network failures, including
    ``requests.exceptions.Timeout``, propagate to the caller.
    """
    errors = []

    # Pass the value via `params` so it is URL-encoded, and set a timeout so
    # a stalled server cannot hang the check indefinitely.
    api_result = requests.get(
        "http://api.catalogue.ceda.ac.uk/api/v2/identifiers.json/",
        params={"url": value},
        timeout=30,
    )

    # Decode the response body once and reuse it for both conditions.
    results = api_result.json()['results']
    legit_platform = (
        len(results) == 1
        and results[0]['relatedTo']['short_code'] == "plat"
    )

    if not legit_platform:
        errors.append(f"{label} '{value}' is not a valid platform in the CEDA catalogue")

    return errors


def ncas_platform(value, context, extras=None, label=""):
    """
    A function to check if the platform is in the NCAS platform list

    Resolves the ``releases/latest`` page of the
    ``ncasuk/ncas-data-platform-vocabs`` GitHub repository, takes the last
    path component of the resolved URL as the version, and downloads
    ``AMF_CVs/AMF_platform.json`` for that version. ``value`` is valid when
    it is a key of the ``platform`` mapping in that file.

    ``context`` and ``extras`` are not used by this rule; ``label`` is
    prefixed to the error message.

    Returns a list that is empty when the platform is valid, or contains a
    single error message otherwise. Network failures, including
    ``requests.exceptions.Timeout``, propagate to the caller.
    """
    errors = []

    # Without an explicit timeout `requests` waits forever on a stalled
    # server, which would hang the whole check run.
    timeout_seconds = 30

    latest_page = requests.get(
        "https://github.com/ncasuk/ncas-data-platform-vocabs/releases/latest",
        timeout=timeout_seconds,
    )
    latest_version = latest_page.url.split("/")[-1]

    result = requests.get(
        f"https://raw.githubusercontent.com/ncasuk/ncas-data-platform-vocabs/{latest_version}/AMF_CVs/AMF_platform.json",
        timeout=timeout_seconds,
    )
    # Membership is tested against the keys of the mapping.
    ncas_platforms = result.json()['platform']

    if value not in ncas_platforms:
        errors.append(f"{label} '{value}' is not a valid NCAS platform")

    return errors


def check_qc_flags(value, context, extras=None, label=""):
"""
A function to check flag_values and flag_meanings
Expand Down
5 changes: 3 additions & 2 deletions specs/groups/ncas-amof-2.0.0/amof-file-name.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ file-name-format:
func: checksit.generic.check_file_name
params:
vocab_checks:
instrument: __vocabs__:AMF_CVs/2.0.0/AMF_ncas_instrument:ncas_instrument:__all__
platform: __vocabs__:AMF_CVs/2.0.0/AMF_platform:platform:__all__
instrument: __URL__raw.githubusercontent.com/ncasuk/ncas-data-instrument-vocabs/__latest__/AMF_CVs/AMF_ncas_instrument.json:ncas_instrument:__all__
data_product: __vocabs__:AMF_CVs/2.0.0/AMF_product:product
rule_checks:
platform: rule-func:ceda-platform, rule-func-warning:ncas-platform
4 changes: 2 additions & 2 deletions specs/groups/ncas-amof-2.0.0/amof-global-attrs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ required-global-attrs:
func: checksit.generic.check_global_attrs
params:
vocab_attrs:
source: __vocabs__:AMF_CVs/2.0.0/AMF_ncas_instrument:ncas_instrument:__all__:description
platform: __vocabs__:AMF_CVs/2.0.0/AMF_platform:platform:__all__
source: __URL__raw.githubusercontent.com/ncasuk/ncas-data-instrument-vocabs/__latest__/AMF_CVs/AMF_ncas_instrument.json:ncas_instrument:__all__:description
rules_attrs:
platform: rule-func:ceda-platform, rule-func-warning:ncas-platform
Conventions: regex:CF-1.6,\sNCAS-AMF-2.0.0
instrument_manufacturer: rule-func:string-of-length:2+
instrument_model: rule-func:string-of-length:3+
Expand Down
38 changes: 20 additions & 18 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def test_check_global_attrs():
assert errors == ["[global-attributes:**************:attr4]: Attribute 'attr4' does not exist. "]
assert warnings == []

# Test function handles undefined attributes with vocab checks correctly
# Test function handles undefined attributes with vocab checks correctly
vocab_attrs = {
"attr1": "__vocabs__:tests/test_platforms:test_platforms:__all__"
}
Expand Down Expand Up @@ -136,7 +136,7 @@ def test_check_global_attrs():
assert errors == ["[global-attributes:**************:attr4]: Attribute 'attr4' does not exist. "]
assert warnings == []

# Test function handles undefined attributes with regex checks correctly
# Test function handles undefined attributes with regex checks correctly
regex_attrs = {
"attr1": r"\d{4}-\d{2}-\d{2}"
}
Expand Down Expand Up @@ -168,7 +168,7 @@ def test_check_global_attrs():
assert errors == ["[global-attributes:**************:attr4]: Attribute 'attr4' does not exist. "]
assert warnings == []

# Test function handles undefined attributes with rules checks correctly
# Test function handles undefined attributes with rules checks correctly
rules_attrs = {
"attr1": "rule-func:string-of-length:5"
}
Expand Down Expand Up @@ -347,68 +347,70 @@ def test_check_file_name():
# Test that the function correctly identifies invalid instrument name
vocab_checks = {
"instrument": "__vocabs__:tests/test_instruments:test_instruments:__all__",
"platform": "__vocabs__:tests/test_platforms:test_platforms:__all__",
"data_product": "__vocabs__:tests/test_products:test_products"
}
rule_checks = {
"platform": "rule-func:match-one-of:plat1|plat2"
}
file_name = "inst3_plat1_20220101_prod1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - unknown instrument inst3"]
assert warnings == []

# Test that the function correctly identifies invalid platform name
file_name = "inst1_plat3_20220101_prod1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
assert errors == ["[file name]: Invalid file name format - unknown platform plat3"]
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - 'plat3' must be one of: '['plat1', 'plat2']'"]
assert warnings == []

# Test that the function correctly identifies invalid date format
file_name = "inst1_plat1_2022010_prod1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - bad date format 2022010"]
assert warnings == []

# Test that the function correctly identifies invalid date
file_name = "inst1_plat1_20221301_prod1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - invalid date in file name 20221301"]
assert warnings == []

# Test that the function correctly identifies invalid data product
file_name = "inst1_plat1_20220101_prod3_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - unknown data product prod3"]
assert warnings == []

# Test that the function correctly identifies invalid version number format
file_name = "inst1_plat1_20220101_prod1_v10.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - incorrect file version number v10"]
assert warnings == []

# Test that the function correctly identifies too many options in file name
file_name = "inst1_plat1_20220101_prod1_option1_option2_option3_option4_option5_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - too many options in file name"]
assert warnings == []

# Test that the function correctly handles multiple errors
file_name = "inst3_plat3_20220101_prod1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
assert errors == ["[file name]: Invalid file name format - unknown instrument inst3","[file name]: Invalid file name format - unknown platform plat3"]
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == ["[file name]: Invalid file name format - unknown instrument inst3","[file name]: Invalid file name format - 'plat3' must be one of: '['plat1', 'plat2']'"]
assert warnings == []

# Test that the function correctly handles valid file names
file_name = "inst1_plat1_20220101_prod1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == []
assert warnings == []

file_name = "inst1_plat1_20220101_prod1_opt1_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == []
assert warnings == []

file_name = "inst1_plat1_20220101_prod1_opt1_opt2_opt3_v1.0.nc"
errors, warnings = cg.check_file_name(file_name, vocab_checks)
errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
assert errors == []
assert warnings == []
assert warnings == []
19 changes: 19 additions & 0 deletions tests/test_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,25 @@ def test_longitude():
assert crf.longitude('200.0000', {}, label='Test') == ["Test '200.0000' must be within -180 and +180 "]


def test_ceda_platform():
    """Exercise crf.ceda_platform with known-good and known-bad platform names."""
    ncas_platform_names = [
        "bt-tower-t35",
        "cao",
        "cao-sparsholt",
        "cdao",
        "cdao-frongoch",
        "cvao",
        "faam",
        "iao",
        "wao",
    ]

    # Each NCAS platform is expected to produce no errors
    for platform_name in ncas_platform_names:
        outcome = crf.ceda_platform(platform_name, {})
        assert outcome == []

    # A platform that is not an NCAS platform is still expected to pass
    non_ncas_outcome = crf.ceda_platform("netheravon", {})
    assert non_ncas_outcome == []

    # A made-up platform name must be reported, with the label prefixed
    bogus_outcome = crf.ceda_platform("example", {}, label='Test')
    assert bogus_outcome == ["Test 'example' is not a valid platform in the CEDA catalogue"]


def test_ncas_platform():
    """Exercise crf.ncas_platform with known-good and known-bad platform names."""
    ncas_platform_names = [
        "bt-tower-t35",
        "cao",
        "cao-sparsholt",
        "cdao",
        "cdao-frongoch",
        "cvao",
        "faam",
        "iao",
        "wao",
    ]

    # Each NCAS platform is expected to produce no errors
    for platform_name in ncas_platform_names:
        outcome = crf.ncas_platform(platform_name, {})
        assert outcome == []

    # A platform that is not an NCAS platform must be reported
    non_ncas_outcome = crf.ncas_platform("netheravon", {}, label='Test')
    assert non_ncas_outcome == ["Test 'netheravon' is not a valid NCAS platform"]

    # A made-up platform name must be reported as well
    bogus_outcome = crf.ncas_platform("example", {}, label='Test')
    assert bogus_outcome == ["Test 'example' is not a valid NCAS platform"]

# rules.py
def _test_type(_type, value):
return r.check(f"type-rule:{_type}", value)
Expand Down
Loading