Merge pull request #43 from cedadev/url_vocabs

Changes to vocab checks
cedadev · Mar 1, 2024 · 7c1c336 · 7c1c336
2 parents e618d83 + 40e017d
commit 7c1c336
Show file tree

Hide file tree

Showing 7 changed files with 107 additions and 28 deletions.
diff --git a/checksit/cvs.py b/checksit/cvs.py
@@ -2,6 +2,7 @@
 import re
 import json
 from collections import deque
+import requests
 
 
 from .config import get_config
@@ -23,10 +24,27 @@ def _load(self, vocab_id):
         vocab_file = os.path.join(vocabs_dir, f"{vocab_id}.json")
         self._vocabs[vocab_id] = json.load(open(vocab_file))
 
+    def _load_from_url(self, vocab_id):
+        """Fetch the JSON vocab behind a "__URL__" id and cache it under vocab_id (warns, caches nothing, on non-200)."""
+        url = vocab_id.replace("__URL__", "https://")
+        if url.startswith("https://raw.githubusercontent.com") and "/__latest__/" in url:
+            # Relies on GitHub redirecting <repo>/releases/latest to .../<tag>: the last URL segment is the tag
+            repo_url = url.split("/__latest__")[0].replace("raw.githubusercontent.com", "github.com")
+            latest_version = requests.get(f"{repo_url}/releases/latest", timeout=30).url.split("/")[-1]
+            url = url.replace("__latest__", latest_version)
+        res = requests.get(url, timeout=30)  # timeout stops a stalled server from hanging the check forever
+        if res.status_code == 200:
+            self._vocabs[vocab_id] = res.json()
+        else:
+            print(f"[WARNING] Failed to load vocab: {vocab_id}")
+
     def __getitem__(self, vocab_id):
         # Enables dictionary access to individual vocabulary items
         if vocab_id not in self._vocabs:
-            self._load(vocab_id)
+            if vocab_id.startswith("__URL__"):
+                self._load_from_url(vocab_id)
+            else:
+                self._load(vocab_id)
 
         return self._vocabs[vocab_id] 
 

diff --git a/checksit/generic.py b/checksit/generic.py
@@ -267,13 +267,14 @@ def check_var(dct, variable, defined_attrs, attr_rules=[], skip_spellcheck=False
     return errors, warnings
 
 
-def check_file_name(file_name, vocab_checks=None, **kwargs):
+def check_file_name(file_name, vocab_checks=None, rule_checks=None, **kwargs):
     """
     Checks format of file name
 
     Works for NCAS-GENERAL, would work for NCAS-RADAR if radar scan type is added as data product
     """
     vocab_checks = vocab_checks or {}
+    rule_checks = rule_checks or {}
     errors = []
     warnings = []
     file_name_parts = file_name.split("_")
@@ -287,11 +288,15 @@ def check_file_name(file_name, vocab_checks=None, **kwargs):
         raise KeyError(msg)
 
     # check platform
-    if "platform" in vocab_checks.keys():
-        if vocabs.check(vocab_checks["platform"], file_name_parts[1], label="_") != []:
-            errors.append(f"[file name]: Invalid file name format - unknown platform {file_name_parts[1]}")
+    if "platform" in rule_checks.keys():
+        if rules.check(rule_checks["platform"], file_name_parts[1], label="[file name]: Invalid file name format -") != ([], []):
+            rule_errors, rule_warnings = rules.check(rule_checks["platform"], file_name_parts[1], label="[file name]: Invalid file name format -")
+            if rule_errors != []:
+                errors.extend(rule_errors)
+            if rule_warnings != []:
+                warnings.extend(rule_warnings)
     else:
-        msg = "No platform vocab defined in specs"
+        msg = "No platform rule defined in specs"
         raise KeyError(msg)
 
     # check date format

diff --git a/checksit/rules/rule_funcs.py b/checksit/rules/rule_funcs.py
@@ -269,6 +269,40 @@ def longitude(value, context, extras=None, label=""):
     return errors
 
 
+def ceda_platform(value, context, extras=None, label=""):
+    """
+    Check that `value` is a platform known to the CEDA catalogue API.
+
+    `context` and `extras` are accepted but not used by this function.
+    Returns [] when the platform is recognised, otherwise a list holding one error message.
+    """
+    # Send the value through `params` so it is URL-encoded, and bound the wait with a timeout
+    api_result = requests.get("http://api.catalogue.ceda.ac.uk/api/v2/identifiers.json/",
+                              params={"url": value}, timeout=30)
+    results = api_result.json()['results']  # parse the response body once only
+    # Valid only when exactly one identifier matches and its relatedTo short_code is "plat"
+    if len(results) == 1 and results[0]['relatedTo']['short_code'] == "plat":
+        return []
+    return [f"{label} '{value}' is not a valid platform in the CEDA catalogue"]
+
+
+def ncas_platform(value, context, extras=None, label=""):
+    """
+    Check that `value` is a key of the "platform" vocabulary published in the
+    latest release of ncasuk/ncas-data-platform-vocabs on GitHub.
+
+    `context` and `extras` are not used. Returns [] on a match, else a one-item error list.
+    """
+    repo = "ncasuk/ncas-data-platform-vocabs"
+    # Relies on GitHub redirecting /releases/latest to .../<tag>: the last URL segment is the tag
+    latest_version = requests.get(f"https://github.com/{repo}/releases/latest", timeout=30).url.split("/")[-1]
+    # Timeouts keep an unresponsive server from hanging the whole check run
+    result = requests.get(f"https://raw.githubusercontent.com/{repo}/{latest_version}/AMF_CVs/AMF_platform.json", timeout=30)
+    if value in result.json()['platform']:
+        return []
+    return [f"{label} '{value}' is not a valid NCAS platform"]
+
+
 def check_qc_flags(value, context, extras=None, label=""):
     """
     A function to check flag_values and flag_meanings

diff --git a/specs/groups/ncas-amof-2.0.0/amof-file-name.yml b/specs/groups/ncas-amof-2.0.0/amof-file-name.yml
@@ -2,6 +2,7 @@ file-name-format:
   func: checksit.generic.check_file_name
   params:
     vocab_checks:
-      instrument: __vocabs__:AMF_CVs/2.0.0/AMF_ncas_instrument:ncas_instrument:__all__
-      platform: __vocabs__:AMF_CVs/2.0.0/AMF_platform:platform:__all__
+      instrument: __URL__raw.githubusercontent.com/ncasuk/ncas-data-instrument-vocabs/__latest__/AMF_CVs/AMF_ncas_instrument.json:ncas_instrument:__all__
       data_product: __vocabs__:AMF_CVs/2.0.0/AMF_product:product
+    rule_checks:
+      platform: rule-func:ceda-platform, rule-func-warning:ncas-platform
diff --git a/specs/groups/ncas-amof-2.0.0/amof-global-attrs.yml b/specs/groups/ncas-amof-2.0.0/amof-global-attrs.yml
@@ -2,9 +2,9 @@ required-global-attrs:
   func: checksit.generic.check_global_attrs
   params:
     vocab_attrs:
-      source: __vocabs__:AMF_CVs/2.0.0/AMF_ncas_instrument:ncas_instrument:__all__:description
-      platform: __vocabs__:AMF_CVs/2.0.0/AMF_platform:platform:__all__
+      source: __URL__raw.githubusercontent.com/ncasuk/ncas-data-instrument-vocabs/__latest__/AMF_CVs/AMF_ncas_instrument.json:ncas_instrument:__all__:description
     rules_attrs:
+      platform: rule-func:ceda-platform, rule-func-warning:ncas-platform
       Conventions: regex:CF-1.6,\sNCAS-AMF-2.0.0
       instrument_manufacturer: rule-func:string-of-length:2+
       instrument_model: rule-func:string-of-length:3+

diff --git a/tests/test_generic.py b/tests/test_generic.py
@@ -104,7 +104,7 @@ def test_check_global_attrs():
     assert errors == ["[global-attributes:**************:attr4]: Attribute 'attr4' does not exist. "]
     assert warnings == []
 
-    # Test function handles undefined attributes with vocab checks correctly 
+    # Test function handles undefined attributes with vocab checks correctly
     vocab_attrs = {
         "attr1": "__vocabs__:tests/test_platforms:test_platforms:__all__"
     }
@@ -136,7 +136,7 @@ def test_check_global_attrs():
     assert errors == ["[global-attributes:**************:attr4]: Attribute 'attr4' does not exist. "]
     assert warnings == []
 
-    # Test function handles undefined attributes with regex checks correctly 
+    # Test function handles undefined attributes with regex checks correctly
     regex_attrs = {
         "attr1": r"\d{4}-\d{2}-\d{2}"
     }
@@ -168,7 +168,7 @@ def test_check_global_attrs():
     assert errors == ["[global-attributes:**************:attr4]: Attribute 'attr4' does not exist. "]
     assert warnings == []
 
-    # Test function handles undefined attributes with rules checks correctly 
+    # Test function handles undefined attributes with rules checks correctly
     rules_attrs = {
         "attr1": "rule-func:string-of-length:5"
     }
@@ -347,68 +347,70 @@ def test_check_file_name():
     # Test that the function correctly identifies invalid instrument name
     vocab_checks = {
         "instrument": "__vocabs__:tests/test_instruments:test_instruments:__all__",
-        "platform": "__vocabs__:tests/test_platforms:test_platforms:__all__",
         "data_product": "__vocabs__:tests/test_products:test_products"
     }
+    rule_checks = {
+        "platform": "rule-func:match-one-of:plat1|plat2"
+    }
     file_name = "inst3_plat1_20220101_prod1_v1.0.nc"
-    errors, warnings = cg.check_file_name(file_name, vocab_checks)
+    errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
     assert errors == ["[file name]: Invalid file name format - unknown instrument inst3"]
     assert warnings == []
 
     # Test that the function correctly identifies invalid platform name
     file_name = "inst1_plat3_20220101_prod1_v1.0.nc"
-    errors, warnings = cg.check_file_name(file_name, vocab_checks)
-    assert errors == ["[file name]: Invalid file name format - unknown platform plat3"]
+    errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
+    assert errors == ["[file name]: Invalid file name format - 'plat3' must be one of: '['plat1', 'plat2']'"]
     assert warnings == []
 
     # Test that the function correctly identifies invalid date format
     file_name = "inst1_plat1_2022010_prod1_v1.0.nc"
-    errors, warnings = cg.check_file_name(file_name, vocab_checks)
+    errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
     assert errors == ["[file name]: Invalid file name format - bad date format 2022010"]
     assert warnings == []
 
     # Test that the function correctly identifies invalid date
     file_name = "inst1_plat1_20221301_prod1_v1.0.nc"
-    errors, warnings = cg.check_file_name(file_name, vocab_checks)
+    errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
     assert errors == ["[file name]: Invalid file name format - invalid date in file name 20221301"]
     assert warnings == []
 
     # Test that the function correctly identifies invalid data product
     file_name = "inst1_plat1_20220101_prod3_v1.0.nc"
-    errors, warnings = cg.check_file_name(file_name, vocab_checks)
+    errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
     assert errors == ["[file name]: Invalid file name format - unknown data product prod3"]
     assert warnings == []
 
     # Test that the function correctly identifies invalid version number format
     file_name = "inst1_plat1_20220101_prod1_v10.nc"
-    errors, warnings = cg.check_file_name(file_name, vocab_checks)
+    errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
     assert errors == ["[file name]: Invalid file name format - incorrect file version number v10"]
     assert warnings == []
 
     # Test that the function correctly identifies too many options in file name
     file_name = "inst1_plat1_20220101_prod1_option1_option2_option3_option4_option5_v1.0.nc"
-    errors, warnings = cg.check_file_name(file_name, vocab_checks)
+    errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
     assert errors == ["[file name]: Invalid file name format - too many options in file name"]
     assert warnings == []
 
     # Test that the function correctly handles multiple errors
     file_name = "inst3_plat3_20220101_prod1_v1.0.nc"
-    errors, warnings = cg.check_file_name(file_name, vocab_checks)
-    assert errors == ["[file name]: Invalid file name format - unknown instrument inst3","[file name]: Invalid file name format - unknown platform plat3"]
+    errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
+    assert errors == ["[file name]: Invalid file name format - unknown instrument inst3","[file name]: Invalid file name format - 'plat3' must be one of: '['plat1', 'plat2']'"]
     assert warnings == []
 
     # Test that the function correctly handles valid file names
     file_name = "inst1_plat1_20220101_prod1_v1.0.nc"
-    errors, warnings = cg.check_file_name(file_name, vocab_checks)
+    errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
     assert errors == []
     assert warnings == []
 
     file_name = "inst1_plat1_20220101_prod1_opt1_v1.0.nc"
-    errors, warnings = cg.check_file_name(file_name, vocab_checks)
+    errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
     assert errors == []
     assert warnings == []
 
     file_name = "inst1_plat1_20220101_prod1_opt1_opt2_opt3_v1.0.nc"
-    errors, warnings = cg.check_file_name(file_name, vocab_checks)
+    errors, warnings = cg.check_file_name(file_name, vocab_checks, rule_checks)
     assert errors == []
-    assert warnings == []
+    assert warnings == []
diff --git a/tests/test_rules.py b/tests/test_rules.py
@@ -204,6 +204,25 @@ def test_longitude():
     assert crf.longitude('200.0000', {}, label='Test') == ["Test '200.0000' must be within -180 and +180 "]
 
 
+def test_ceda_platform():
+    # NOTE: ceda_platform queries the CEDA catalogue API, so this test needs network access
+    ncas_platforms = ("bt-tower-t35", "cao", "cao-sparsholt", "cdao", "cdao-frongoch", "cvao", "faam", "iao", "wao")
+    # Platforms the catalogue should accept: every NCAS one, then a non-NCAS one
+    for accepted in ncas_platforms + ("netheravon",):
+        assert crf.ceda_platform(accepted, {}) == []
+    # The placeholder name must be rejected, with the label prefixed to the message
+    assert crf.ceda_platform("example", {}, label='Test') == ["Test 'example' is not a valid platform in the CEDA catalogue"]
+
+
+def test_ncas_platform():
+    # NOTE: ncas_platform downloads the vocabulary from GitHub, so this test needs network access
+    for known in ("bt-tower-t35", "cao", "cao-sparsholt", "cdao", "cdao-frongoch", "cvao", "faam", "iao", "wao"):
+        assert crf.ncas_platform(known, {}) == []
+    # Names outside the NCAS list (a non-NCAS platform, then a placeholder) must be rejected
+    for unknown, message in (("netheravon", "Test 'netheravon' is not a valid NCAS platform"),
+                             ("example", "Test 'example' is not a valid NCAS platform")):
+        assert crf.ncas_platform(unknown, {}, label='Test') == [message]
+
 # rules.py
 def _test_type(_type, value):
     return r.check(f"type-rule:{_type}", value)