Merge pull request #53 from cedadev/esa-cci

ESA CCI file checking
cedadev · Dec 20, 2024 · 3ee22df · 3ee22df
2 parents cd9cbf7 + 01f8438
commit 3ee22df
Show file tree

Hide file tree

Showing 14 changed files with 490 additions and 17 deletions.
diff --git a/checksit/check.py b/checksit/check.py
@@ -196,7 +196,7 @@ def _check_file(
 
         for spec in specs:
             sr = SpecificationChecker(spec)
-            if "amof-file-name" in spec:
+            if "file-name" in spec:
                 spec_errors, spec_warnings = sr.run_checks(
                     file_content.inpt.split("/")[-1]
                 )

diff --git a/checksit/cvs.py b/checksit/cvs.py
@@ -25,26 +25,57 @@ def _load(self, vocab_id):
         vocab_file = os.path.join(vocabs_dir, f"{vocab_id}.json")
         self._vocabs[vocab_id] = json.load(open(vocab_file))
 
-    def _load_from_url(self, vocab_id):
-        # Loads a specific vocabulary from a URL
-        vocab_id_url = vocab_id.replace("__URL__", "https://")
-        if (
-            vocab_id_url.startswith("https://raw.githubusercontent.com")
-            and "/__latest__/" in vocab_id_url
-        ):
-            vocab_id_url_base = vocab_id_url.split("/__latest__")[0]
-            vocab_id_url_base = vocab_id_url_base.replace(
-                "raw.githubusercontent.com", "github.com"
-            )
+    def _load_from_url_github(self, vocab_id_url: str):
+        # Fetches a vocabulary (parsed JSON) from a raw.githubusercontent.com URL.
+        #
+        # vocab_id_url: the vocabulary URL; it may contain a "/__latest__/" path
+        #     placeholder standing in for a release name.
+        # Returns the decoded JSON body, or an empty list (after printing a
+        # warning) when the final request does not return HTTP 200.
+        vocab_list = []
+        vocab_id_url_base = vocab_id_url.split("/__latest__")[0]
+        vocab_id_url_base = vocab_id_url_base.replace(
+            "raw.githubusercontent.com", "github.com"
+        )
+        if "/__latest__/" in vocab_id_url:
+            # The placeholder is replaced with the last path segment of the URL
+            # that the "<base>/releases/latest" request ends up at
+            # (presumably the latest release tag - confirm against GitHub behaviour).
             latest_version = requests.get(
                 f"{vocab_id_url_base}/releases/latest"
             ).url.split("/")[-1]
             vocab_id_url = vocab_id_url.replace("__latest__", latest_version)
+        # NOTE(review): _load_from_url has already substituted "__URL__" before
+        # calling this method, so the replace below is a no-op on that path.
         res = requests.get(vocab_id_url.replace("__URL__", "https://"))
-        if res.status_code == 200:
-            self._vocabs[vocab_id] = res.json()
+        if res.status_code != 200:
+            print(f"[WARNING] Failed to load vocab: {vocab_id_url}")
+            return vocab_list
+        vocab_list = res.json()
+
+        return vocab_list
+
+    def _load_from_url_esacci(self, vocab_id_url: str):
+        # Builds a sorted list of labels from a vocabulary JSON document at
+        # vocab_id_url. URLs containing 'dataType' yield the "#altLabel" values,
+        # URLs containing 'product' yield the "#prefLabel" values; 'dataType' is
+        # tested first. Returns an empty list (after printing a warning) when the
+        # request does not return HTTP 200 or the URL matches neither kind.
+        response = requests.get(vocab_id_url)
+        if response.status_code != 200:
+            print(f"[WARNING] Failed to load vocab: {vocab_id_url}")
+            return []
+        records = response.json()
+
+        if 'dataType' in vocab_id_url:
+            label_suffix = "#altLabel"
+        elif 'product' in vocab_id_url:
+            label_suffix = "#prefLabel"
+        else:
+            print(f"[WARNING] ESA CCI vocab url not recognised: {vocab_id_url}")
+            return []
+
+        # Each record is a mapping; the label is the "@value" of the first entry
+        # stored under any key ending with the chosen suffix.
+        labels = []
+        for record in records:
+            for key, entries in record.items():
+                if key.endswith(label_suffix):
+                    labels.append(entries[0]["@value"])
+        labels.sort()
+
+        return labels
+
+    def _load_from_url(self, vocab_id: str):
+        # Loads a specific vocabulary from a URL and stores it in self._vocabs
+        # under the original vocab_id (i.e. with the "__URL__" prefix intact).
+        #
+        # "__URL__" is expanded to "https://" and the request is dispatched on
+        # the host: raw.githubusercontent.com or vocab.ceda.ac.uk. Any other
+        # host prints a message and results in an empty vocabulary.
+        vocab_id_url = vocab_id.replace("__URL__", "https://")
+        if vocab_id_url.startswith("https://raw.githubusercontent.com"):
+            vocab_list = self._load_from_url_github(vocab_id_url)
+        elif vocab_id_url.startswith("https://vocab.ceda.ac.uk"):
+            vocab_list = self._load_from_url_esacci(vocab_id_url)
         else:
-            print(f"[WARNING] Failed to load vocab: {vocab_id}")
+            print(f"Vocabulary url provided is not recognised: {vocab_id_url}")
+            # Without a value here the assignment below raised
+            # UnboundLocalError; use the same empty result the loaders
+            # return on failure.
+            vocab_list = []
+
+        self._vocabs[vocab_id] = vocab_list
 
     def __getitem__(self, vocab_id):
         # Enables dictionary access to individual vocabulary items
@@ -85,16 +116,21 @@ def lookup(self, vocab_lookup):
 
         return obj
 
-    def check(self, vocab_lookup, value, label="", lookup=True):
+    def check(self, vocab_lookup, value, label="", lookup=True, spec_verb=False):
         # Return a list of errors - empty list if no errors
         errors = []
         options = [self.lookup(vocab_lookup) if lookup else vocab_lookup][0]
+        if spec_verb:
+            print(f"Vocab lookup: {vocab_lookup}")
 
         if isinstance(options, list):
             if value not in options:
                 errors.append(
                     f"{label} '{value}' not in vocab options: {options} (using: '{vocab_lookup}')"
                 )
+            else:
+                if spec_verb:
+                    print(f"Value: {value} is in list {options}")
         elif isinstance(options, dict):
             for key in options.keys():
                 if key in value.keys():

diff --git a/checksit/generic.py b/checksit/generic.py
@@ -11,7 +11,9 @@
 DATE_REGEX = re.compile(
     r"^\d{4}$|^\d{6}$|^\d{8}$|^\d{8}-\d{2}$|^\d{8}-\d{4}$|^\d{8}-\d{6}$"
 )
-
+# Matches digit-only strings of exactly 4, 6, 8, 10, 12 or 14 characters
+# (the lengths of YYYY, YYYYMM, ... YYYYMMDDHHMMSS). It checks shape only;
+# the digits themselves are not range-checked by this pattern.
+DATE_REGEX_GENERIC = re.compile(
+    r"^\d{4}$|^\d{6}$|^\d{8}$|^\d{10}$|^\d{12}$|^\d{14}$"
+)
 
 def _get_bounds_var_ids(dct):
     return [
@@ -552,6 +554,132 @@ def check_file_name(file_name, vocab_checks=None, rule_checks=None, **kwargs):
 
     return errors, warnings
 
+def _spec_option(container, key, default):
+    # Return container[key], or default when the option block is missing or
+    # is not something that can be indexed by key.
+    try:
+        return container[key]
+    except (TypeError, KeyError, IndexError):
+        return default
+
+
+def _matches_any_date_format(value, formats):
+    # True when value parses with at least one of the strptime formats.
+    for fmt in formats:
+        try:
+            dt.datetime.strptime(value, fmt)
+        except ValueError:
+            continue
+        return True
+    return False
+
+
+def check_generic_file_name(file_name, vocab_checks=None, segregator=None, extension=None, spec_verbose=False, **kwargs):
+    """Check each part of a file name against a positional list of field rules.
+
+    The extension is stripped from the end of ``file_name`` and the remainder
+    is split on the segregator. Part number ``i`` (zero-based) is validated
+    against ``vocab_checks["field%02d" % i]``, whose value selects the rule:
+
+    * ``__vocabs__...`` / ``__URL__...``: value must pass ``vocabs.check``.
+    * ``__date__:<fmt>[,<fmt>...]``: digit-only date that parses with at
+      least one of the comma-separated ``strptime`` formats.
+    * ``__version__:<regex>``: value must match the regular expression.
+
+    Args:
+        file_name: File name (not a path) to check.
+        vocab_checks: Mapping of ``fieldNN`` keys to rule strings.
+        segregator: Mapping with key ``seg``; defaults to ``'_'``.
+        extension: Mapping with key ``ext``; defaults to ``'.test'``.
+        spec_verbose: Mapping with key ``spec_verb``, or a plain bool;
+            when true, progress and errors are printed as they are found.
+        **kwargs: Accepted and ignored.
+
+    Returns:
+        ``(errors, warnings)``; ``warnings`` is always empty.
+
+    Raises:
+        ValueError: If ``file_name`` is not a string.
+    """
+    if not isinstance(file_name, str):
+        raise ValueError(
+            f"file_name must be a string, got {type(file_name).__name__}"
+        )
+
+    vocab_checks = vocab_checks or {}
+    seg = _spec_option(segregator, "seg", "_")
+    ext = _spec_option(extension, "ext", ".test")
+    if isinstance(spec_verbose, bool):
+        # The signature default is a bool, so honour a bool passed directly.
+        spec_verb = spec_verbose
+    else:
+        spec_verb = _spec_option(spec_verbose, "spec_verb", False)
+
+    errors = []
+    warnings = []
+
+    def report(message):
+        # Record an error and echo it when running verbosely.
+        errors.append(message)
+        if spec_verb:
+            print(message)
+
+    # Strip the extension only where it is a suffix; str.replace would also
+    # remove the same text from the middle of the name.
+    if ext and file_name.endswith(ext):
+        extracted_name = file_name[: -len(ext)]
+    else:
+        extracted_name = file_name
+    file_name_parts = extracted_name.split(seg)
+
+    if spec_verb:
+        print(f"File name: {file_name}")
+        print(f"Segregator: {seg}")
+        print(f"Extension: {ext}")
+        print(f"All file name parts: {file_name_parts}")
+
+    # The number of parts must equal the number of configured fields; this
+    # does not depend on the individual part, so it is checked once up front.
+    found = len(file_name_parts)
+    expected = len(vocab_checks)
+    if found != expected:
+        comparison = "greater" if found > expected else "less"
+        report(
+            f"[file name]: Number of file name fields ({found}) is {comparison} than the {expected} fields expected."
+        )
+        return errors, warnings
+
+    for idx, key in enumerate(file_name_parts):
+        if spec_verb:
+            print('')
+            print(idx, key)
+        field_name = f"field{idx:02}"
+
+        # Field keys are positional; a gap in the numbering is a spec error,
+        # reported here instead of surfacing as a KeyError.
+        try:
+            field = vocab_checks[field_name]
+        except KeyError:
+            report(
+                f"[file name]: No '{field_name}' entry is defined for file name part '{key}'."
+            )
+            continue
+
+        if not isinstance(field, str):
+            report(f"[file name]: {field} field type not recognised.")
+            continue
+
+        if field.startswith(('__vocabs__', '__URL__')):
+            # VOCAB (config or URL)
+            if vocabs.check(field, key, spec_verb=spec_verb) != []:
+                report(f"[file name]: Unknown field '{key}' in vocab {field}.")
+
+        elif field.startswith('__date__'):
+            # DATE REGEX: everything after the first ':' is the format list
+            fmts = field.partition(":")[2].split(",")
+            if spec_verb:
+                print(f"Valid date formats: {fmts}")
+
+            if not DATE_REGEX_GENERIC.match(key):
+                report(
+                    f"[file name]: Expecting date/time - bad date format '{key}'"
+                )
+            elif _matches_any_date_format(key, fmts):
+                if spec_verb:
+                    print(f"Date string {key} matches the required format")
+            else:
+                report(
+                    f"[file name]: Invalid date/time string '{key}'. Date/time should take the form YYYY[MM[DD[HH[MM[SS]]]]], where the fields in brackets are optional."
+                )
+
+        elif field.startswith('__version__'):
+            # FILE/PRODUCT VERSION: partition keeps any ':' inside the regex
+            verfmt = field.partition(":")[2]
+            try:
+                version_ok = re.match(verfmt, key) is not None
+            except re.error as err:
+                report(
+                    f"[file name]: Invalid version pattern '{verfmt}' for {field_name}: {err}"
+                )
+                continue
+            if version_ok:
+                if spec_verb:
+                    print(f"File version {key} matches the required format")
+            else:
+                report(
+                    f"[file name]: Invalid file version '{key}'. File versions should take the form n{{1,}}[.n{{1,}}]."
+                )
+
+        else:
+            # FIELD NOT RECOGNISED
+            report(f"[file name]: {field} field type not recognised.")
+
+    return errors, warnings
 
 def check_radar_moment_variables(
     dct, exist_attrs=None, rule_attrs=None, one_of_attrs=None, skip_spellcheck=False

diff --git a/checksit/vocabs/esa-cci-file-name-config.json b/checksit/vocabs/esa-cci-file-name-config.json
@@ -0,0 +1,5 @@
+{
+  "field00": ["ESACCI"],
+  "field01": ["AEROSOL","AIS","BIOMASS","CLOUD","FIRE","GHG","GHRSST","GIS","GLACIERS","HRLC","ICESHEETS","LAKES","LC","LST","OC","OZONE","PERMAFROST","RD","SEAICE","SEALEVEL","SEASTATE","SEASURFACESALINITY","SNOW","SOILMOISTURE","SST","VEGETATION","WATERVAPOUR"],
+  "field02": ["L0","L1A","L1B","L1C","L2","L2P","L3","L3U","L3C","L3S","L4","IND"]
+}
diff --git a/checksit/vocabs/esa-cci-global-attrs-config.json b/checksit/vocabs/esa-cci-global-attrs-config.json
@@ -0,0 +1,5 @@
+{
+  "Conventions": ["CF-1.5","CF-1.6","CF-1.7","CF-1.8","CF-1.9"],
+  "project": "Climate Change Initiative - European Space Agency",
+  "license": "ESA CCI Data Policy: free and open access"
+}
diff --git a/specs/groups/esa-cci-v1.0/esa-cci-file-name-add-seg.yml b/specs/groups/esa-cci-v1.0/esa-cci-file-name-add-seg.yml
@@ -0,0 +1,28 @@
+file-name-format:
+  func: checksit.generic.check_generic_file_name
+  params:
+    vocab_checks:
+      # ESACCI
+      field00: __vocabs__:esa-cci-file-name-config:field00
+      # CCI Project (e.g. SEAICE)
+      field01: __vocabs__:esa-cci-file-name-config:field01
+      # Processing Level (e.g. L3C)
+      field02: __vocabs__:esa-cci-file-name-config:field02
+      # Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary
+      # https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
+      field03: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
+      # Product String (e.g. NIMBUS5_ESMR-EASE2_NH)
+      # https://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
+      field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
+      # Additional segregator (also stored in the 'product' vocabulary)
+      field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
+      # Date and time
+      field06: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S
+      # File version
+      field07: __version__:^fv\d?\d.?\d?\d?$
+    segregator:
+      seg: '-'
+    extension:
+      ext: '.nc'
+    spec_verbose:
+      spec_verb: True
diff --git a/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg-gds.yml b/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg-gds.yml
@@ -0,0 +1,30 @@
+file-name-format:
+  func: checksit.generic.check_generic_file_name
+  params:
+    vocab_checks:
+      # Date and time
+      field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S
+      # ESACCI
+      field01: __vocabs__:esa-cci-file-name-config:field00
+      # Processing Level (e.g. L3C)
+      field02: __vocabs__:esa-cci-file-name-config:field02
+      # CCI Project (e.g. SEAICE)
+      field03: __vocabs__:esa-cci-file-name-config:field01
+      # Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary
+      # https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
+      field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
+      # Product String (e.g. NIMBUS5_ESMR-EASE2_NH)
+      # https://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
+      field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
+      # Additional segregator (also stored in the 'product' vocabulary)
+      field06: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
+      # GDS version
+      field07: __version__:^v\d?\d.?\d?\d?$
+      # File version
+      field08: __version__:^fv\d?\d.?\d?\d?$
+    segregator:
+      seg: '-'
+    extension:
+      ext: '.nc'
+    spec_verbose:
+      spec_verb: True
diff --git a/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg.yml b/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg.yml
@@ -0,0 +1,28 @@
+file-name-format:
+  func: checksit.generic.check_generic_file_name
+  params:
+    vocab_checks:
+      # Date and time
+      field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S
+      # ESACCI
+      field01: __vocabs__:esa-cci-file-name-config:field00
+      # Processing Level (e.g. L3C)
+      field02: __vocabs__:esa-cci-file-name-config:field02
+      # CCI Project (e.g. SEAICE)
+      field03: __vocabs__:esa-cci-file-name-config:field01
+      # Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary
+      # https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
+      field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
+      # Product String (e.g. NIMBUS5_ESMR-EASE2_NH)
+      # https://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
+      field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
+      # Additional segregator (also stored in the 'product' vocabulary)
+      field06: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
+      # File version
+      field07: __version__:^fv\d?\d.?\d?\d?$
+    segregator:
+      seg: '-'
+    extension:
+      ext: '.nc'
+    spec_verbose:
+      spec_verb: True
diff --git a/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-gds.yml b/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-gds.yml
@@ -0,0 +1,28 @@
+file-name-format:
+  func: checksit.generic.check_generic_file_name
+  params:
+    # check_generic_file_name looks each rule up by position, using the key
+    # "field" + the zero-padded index of the file name part, so the keys
+    # below must be numbered consecutively starting at field00.
+    vocab_checks:
+      # Date and time
+      field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S
+      # ESACCI
+      field01: __vocabs__:esa-cci-file-name-config:field00
+      # Processing Level (e.g. L3C)
+      field02: __vocabs__:esa-cci-file-name-config:field02
+      # CCI Project (e.g. SEAICE)
+      field03: __vocabs__:esa-cci-file-name-config:field01
+      # Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary
+      # https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
+      field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json
+      # Product String (e.g. NIMBUS5_ESMR-EASE2_NH)
+      # https://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
+      field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json
+      # NOTE(review): the '.' in the two version patterns below is unescaped, so it
+      # matches any character, not only a literal dot - confirm whether '\.' was intended.
+      # GDS version
+      field06: __version__:^v\d?\d.?\d?\d?$
+      # File version
+      field07: __version__:^fv\d?\d.?\d?\d?$
+    segregator:
+      seg: '-'
+    extension:
+      ext: '.nc'
+    spec_verbose:
+      spec_verb: True