From f0f8876e3b9a66456f9c1a7e06b230893adb9944 Mon Sep 17 00:00:00 2001 From: knappett Date: Wed, 23 Oct 2024 10:57:25 +0100 Subject: [PATCH 01/11] Initial commit of esa-cci yml files --- specs/groups/esa-cci-attributes.yml | 66 +++++++++++++++++++++++++++++ specs/groups/esa-cci-file-name.yml | 37 ++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 specs/groups/esa-cci-attributes.yml create mode 100644 specs/groups/esa-cci-file-name.yml diff --git a/specs/groups/esa-cci-attributes.yml b/specs/groups/esa-cci-attributes.yml new file mode 100644 index 0000000..7e4b70a --- /dev/null +++ b/specs/groups/esa-cci-attributes.yml @@ -0,0 +1,66 @@ +#required-filename: +# func: checksit.generic.check_file_name + +required-global-attrs0: + func: checksit.generic.check_global_attrs + params: + defined_attrs: + - title + - institution + - source + - history + - references + - tracking_id + - Conventions + - product_version + - format_version + - summary + - keywords + - id + - naming_authority + - keywords_vocabulary + - cdm_data_type + - comment + - date_created + - creator_name + - creator_url + - creator_email + - project + - geospatial_lat_min + - geospatial_lat_max + - geospatial_lon_min + - geospatial_lon_max + - geospatial_vertical_min + - geospatial_vertical_max + - time_coverage_start + - time_coverage_end + - time_coverage_duration + - time_coverage_resolution + - standard_name_vocabulary + - license + - platform + - sensor + - spatial_resolution + - key_variables + vocab_attrs: + Conventions: __vocabs__:esa_cci_global_attrs:Conventions + project: __vocabs__:esa_cci_global_attrs:project + license: __vocabs__:esa_cci_global_attrs:license + +var-requires1: + func: checksit.generic.check_var + params: + variable: + - latitude + defined_attrs: + - units:degree_north + - standard_name:latitude + +var-requires3: + func: checksit.generic.check_var + params: + variable: + - longitude + defined_attrs: + - units:degree_east + - standard_name:longitude diff --git 
a/specs/groups/esa-cci-file-name.yml b/specs/groups/esa-cci-file-name.yml new file mode 100644 index 0000000..940ca2f --- /dev/null +++ b/specs/groups/esa-cci-file-name.yml @@ -0,0 +1,37 @@ +file-name-format: + func: checksit.generic.check_generic_file_name + params: + vocab_checks: + # ESACCI + field00: __vocabs__:esa_cci_file_name:field00 + #segregator01: __vocabs__:esa_cci_file_name:segregator01 + # CCI Project (e.g. SEAICE) + field01: __vocabs__:esa_cci_file_name:field01 + #segregator02: __vocabs__:esa_cci_file_name:segregator02 + # Processing Level (e.g. L3C) + field02: __vocabs__:esa_cci_file_name:field02 + #segregator03: __vocabs__:esa_cci_file_name:segregator03 + # Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary + # https://vocab.ceda.ac.uk/collection/cci/cci-content/dataType.html + # https://vocab.ceda.ac.uk/collection/cci/cci-content/dataType.json + # https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json + field03: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json + #segregator04: __vocabs__:esa_cci_file_name:segre04 + # Product String (e.g. NIMBUS5_ESMR-EASE2_NH) + # https://vocab.ceda.ac.uk/collection/cci/cci-content/product.html + # https://vocab.ceda.ac.uk/collection/cci/cci-content/product.json + # http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + #segregator05: __vocabs__:esa_cci_file_name:segre05 + # Additional field + #field06: "RegEx_Additional_segre" + #segregator06: __vocabs__:esa_cci_file_name:segre06 + # Date (and time?) 
+ #field07: "RegEx_Date_and_RegEx_Time" + #segregator07: __vocabs__:esa_cci_file_name:segre07 + # File version + #field08: "RegEx_file_version" + segregator: + seg: '-' + extension: + ext: '.nc' From 78e3e80a92118ac336887f35bae3bd01a520db66 Mon Sep 17 00:00:00 2001 From: knappett Date: Mon, 4 Nov 2024 10:37:33 +0000 Subject: [PATCH 02/11] Working version to check standard esa cci file names against vocab config files or URL. --- checksit/check.py | 2 +- checksit/cvs.py | 54 +++++++++--- checksit/generic.py | 82 ++++++++++++++++++- checksit/vocabs/esa-cci-file-name-config.json | 5 ++ .../vocabs/esa-cci-global-attrs-config.json | 5 ++ specs/groups/esa-cci-file-name.yml | 25 ++---- ...ttributes.yml => esa-cci-global-attrs.yml} | 27 +----- specs/groups/esa-cci-variable-attrs.yml | 17 ++++ 8 files changed, 160 insertions(+), 57 deletions(-) create mode 100644 checksit/vocabs/esa-cci-file-name-config.json create mode 100644 checksit/vocabs/esa-cci-global-attrs-config.json rename specs/groups/{esa-cci-attributes.yml => esa-cci-global-attrs.yml} (62%) create mode 100644 specs/groups/esa-cci-variable-attrs.yml diff --git a/checksit/check.py b/checksit/check.py index 0c86044..8d2f523 100644 --- a/checksit/check.py +++ b/checksit/check.py @@ -196,7 +196,7 @@ def _check_file( for spec in specs: sr = SpecificationChecker(spec) - if "amof-file-name" in spec: + if "file-name" in spec: spec_errors, spec_warnings = sr.run_checks( file_content.inpt.split("/")[-1] ) diff --git a/checksit/cvs.py b/checksit/cvs.py index 23c9bac..2ffd117 100644 --- a/checksit/cvs.py +++ b/checksit/cvs.py @@ -25,6 +25,37 @@ def _load(self, vocab_id): vocab_file = os.path.join(vocabs_dir, f"{vocab_id}.json") self._vocabs[vocab_id] = json.load(open(vocab_file)) + def _load_from_url_ncas(self, vocab_id_url): + vocab_id_url_base = vocab_id_url.split("/__latest__")[0] + vocab_id_url_base = vocab_id_url_base.replace( + "raw.githubusercontent.com", "github.com" + ) + latest_version = requests.get( 
f"{vocab_id_url_base}/releases/latest" + ).url.split("/")[-1] + vocab_id_url = vocab_id_url.replace("__latest__", latest_version) + res = requests.get(vocab_id_url.replace("__URL__", "https://")) + if res.status_code == 200: + vocab_list = res.json() + else: + print(f"[WARNING] Failed to load vocab: {vocab_id_url}") + return vocab_list + + def _load_from_url_esacci(self, vocab_id_url): + res = requests.get(vocab_id_url) + if res.status_code == 200: + js = res.json() + + if 'dataType' in vocab_id_url: + vocab_list=sorted([altLabel[0]["@value"] for js_dct in js for key, altLabel in js_dct.items() if key.endswith("#altLabel")]) + elif 'product' in vocab_id_url: + vocab_list=sorted([prefLabel[0]["@value"] for js_dct in js for key, prefLabel in js_dct.items() if key.endswith("#prefLabel")]) + else: + print('ESA CCI vocab url not recognised.') + else: + print(f"[WARNING] Failed to load vocab: {vocab_id_url}") + return vocab_list + def _load_from_url(self, vocab_id): # Loads a specific vocabulary from a URL vocab_id_url = vocab_id.replace("__URL__", "https://") @@ -32,19 +63,15 @@ def _load_from_url(self, vocab_id): vocab_id_url.startswith("https://raw.githubusercontent.com") and "/__latest__/" in vocab_id_url ): - vocab_id_url_base = vocab_id_url.split("/__latest__")[0] - vocab_id_url_base = vocab_id_url_base.replace( - "raw.githubusercontent.com", "github.com" - ) - latest_version = requests.get( - f"{vocab_id_url_base}/releases/latest" - ).url.split("/")[-1] - vocab_id_url = vocab_id_url.replace("__latest__", latest_version) - res = requests.get(vocab_id_url.replace("__URL__", "https://")) - if res.status_code == 200: - self._vocabs[vocab_id] = res.json() + vocab_list=self._load_from_url_ncas(vocab_id_url) + elif ( + vocab_id_url.startswith("https://vocab.ceda.ac.uk") + ): + vocab_list=self._load_from_url_esacci(vocab_id_url) else: - print(f"[WARNING] Failed to load vocab: {vocab_id}") + print(f"Vocabulary url provided is not recognised") + + self._vocabs[vocab_id] = 
vocab_list def __getitem__(self, vocab_id): # Enables dictionary access to individual vocabulary items @@ -89,12 +116,15 @@ def check(self, vocab_lookup, value, label="", lookup=True): # Return a list of errors - empty list if no errors errors = [] options = [self.lookup(vocab_lookup) if lookup else vocab_lookup][0] + print(f"Vocab lookup: {vocab_lookup}") if isinstance(options, list): if value not in options: errors.append( f"{label} '{value}' not in vocab options: {options} (using: '{vocab_lookup}')" ) + else: + print(f"Value: {value} is in list {options}") elif isinstance(options, dict): for key in options.keys(): if key in value.keys(): diff --git a/checksit/generic.py b/checksit/generic.py index c000e7e..7a63763 100644 --- a/checksit/generic.py +++ b/checksit/generic.py @@ -11,7 +11,9 @@ DATE_REGEX = re.compile( r"^\d{4}$|^\d{6}$|^\d{8}$|^\d{8}-\d{2}$|^\d{8}-\d{4}$|^\d{8}-\d{6}$" ) - +DATE_REGEX_GENERIC = re.compile( + r"^\d{4}$|^\d{6}$|^\d{8}$|^\d{10}$|^\d{12}$|^\d{14}$" +) def _get_bounds_var_ids(dct): return [ @@ -552,6 +554,84 @@ def check_file_name(file_name, vocab_checks=None, rule_checks=None, **kwargs): return errors, warnings +def check_generic_file_name(file_name, vocab_checks=None, segregator=None, extension=None, **kwargs): + # Requires yaml file containing a list of file name fields and segregators + # Loop over each file field and segregator until there are no more + # check against defined file extension + + vocab_checks = vocab_checks or {} + seg = segregator["seg"] or '_' #'-' + ext = extension["ext"] or '.test' #'.nc' + errors = [] + warnings = [] + # get filename parts + extracted_name = file_name.removesuffix(ext) + file_name_parts = extracted_name.split(seg) + + print(f"File name: {file_name}") + print(f"Segregator: {seg}") + print(f"Extension: {ext}") + print(f"All file name parts: {file_name_parts}") + + # Loop over file name parts + # Assume fields in yml file are in the same order + + for idx, key in enumerate(file_name_parts): + 
print('') + print(idx, key) + num=f"{idx:02}" + field=vocab_checks["field"+num] + + if field.startswith('__vocabs__') or field.startswith('__URL__'): + if ( + vocabs.check(field, key) + != [] + ): + errors.append( + f"[file name]: Invalid file name format - unknown field '{key}'" + ) + print(errors) + elif field.startswith('__date__'): + datefmts=(field.split(":"))[1] + fmts=(datefmts.split(",")) + print(f"Valid date formats: {fmts}") + + if not DATE_REGEX_GENERIC.match(key): + errors.append( + f"[file name]: Invalid file name format - bad date format '{key}'" + ) + else: + valid_date_found = False + for f in fmts: + try: + t = dt.datetime.strptime(key, f) + valid_date_found = True + break + except ValueError: + pass + if valid_date_found: + print(f"Date string {key} matches the required format") + else: + errors.append( + f"[file name]: Invalid file name format - invalid date in file name '{key}'" + ) + print(errors) + elif field.startswith('__version__'): + verfmt=(field.split(":"))[1] + if re.match(verfmt, key): + print(f"File version {key} matches the required format") + else: + errors.append( + f"[file name]: Invalid file name format - invalid file version in file name '{key}'" + ) + print(errors) + else: + errors.append( + f"[file name]: {field} field type not recognised" + ) + print(errors) + + return errors, warnings def check_radar_moment_variables( dct, exist_attrs=None, rule_attrs=None, one_of_attrs=None, skip_spellcheck=False diff --git a/checksit/vocabs/esa-cci-file-name-config.json b/checksit/vocabs/esa-cci-file-name-config.json new file mode 100644 index 0000000..ad20a83 --- /dev/null +++ b/checksit/vocabs/esa-cci-file-name-config.json @@ -0,0 +1,5 @@ +{ + "field00": ["ESACCI"], + "field01": ["AEROSOL","AIS","BIOMASS","CLOUD","FIRE","GHG","GHRSST","GIS","GLACIERS","HRLC","ICESHEETS","LAKES","LC","LST","OC","OZONE","PERMAFROST","RD","SEAICE","SEALEVEL","SEASTATE","SEASURFACESALINITY","SNOW","SOILMOISTURE","SST","VEGETATION","WATERVAPOUR"], + 
"field02": ["L0","L1A","L1B","L1C","L2","L2P","L3","L3U","L3C","L3S","L4","IND"] +} \ No newline at end of file diff --git a/checksit/vocabs/esa-cci-global-attrs-config.json b/checksit/vocabs/esa-cci-global-attrs-config.json new file mode 100644 index 0000000..3fbf852 --- /dev/null +++ b/checksit/vocabs/esa-cci-global-attrs-config.json @@ -0,0 +1,5 @@ +{ + "Conventions": ["CF-1.5","CF-1.6","CF-1.7","CF-1.8","CF-1.9"], + "project": "Climate Change Initiative - European Space Agency", + "license": "ESA CCI Data Policy: free and open access" +} \ No newline at end of file diff --git a/specs/groups/esa-cci-file-name.yml b/specs/groups/esa-cci-file-name.yml index 940ca2f..8ec230c 100644 --- a/specs/groups/esa-cci-file-name.yml +++ b/specs/groups/esa-cci-file-name.yml @@ -3,34 +3,21 @@ file-name-format: params: vocab_checks: # ESACCI - field00: __vocabs__:esa_cci_file_name:field00 - #segregator01: __vocabs__:esa_cci_file_name:segregator01 + field00: __vocabs__:esa-cci-file-name-config:field00 # CCI Project (e.g. SEAICE) - field01: __vocabs__:esa_cci_file_name:field01 - #segregator02: __vocabs__:esa_cci_file_name:segregator02 + field01: __vocabs__:esa-cci-file-name-config:field01 # Processing Level (e.g. L3C) - field02: __vocabs__:esa_cci_file_name:field02 - #segregator03: __vocabs__:esa_cci_file_name:segregator03 + field02: __vocabs__:esa-cci-file-name-config:field02 # Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary - # https://vocab.ceda.ac.uk/collection/cci/cci-content/dataType.html - # https://vocab.ceda.ac.uk/collection/cci/cci-content/dataType.json # https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json field03: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json - #segregator04: __vocabs__:esa_cci_file_name:segre04 # Product String (e.g. 
NIMBUS5_ESMR-EASE2_NH) - # https://vocab.ceda.ac.uk/collection/cci/cci-content/product.html - # https://vocab.ceda.ac.uk/collection/cci/cci-content/product.json # http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json - #segregator05: __vocabs__:esa_cci_file_name:segre05 - # Additional field - #field06: "RegEx_Additional_segre" - #segregator06: __vocabs__:esa_cci_file_name:segre06 - # Date (and time?) - #field07: "RegEx_Date_and_RegEx_Time" - #segregator07: __vocabs__:esa_cci_file_name:segre07 + # Date and time + field05: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S # File version - #field08: "RegEx_file_version" + field06: __version__:^fv\d?\d.?\d?\d?$ segregator: seg: '-' extension: diff --git a/specs/groups/esa-cci-attributes.yml b/specs/groups/esa-cci-global-attrs.yml similarity index 62% rename from specs/groups/esa-cci-attributes.yml rename to specs/groups/esa-cci-global-attrs.yml index 7e4b70a..dd51bf7 100644 --- a/specs/groups/esa-cci-attributes.yml +++ b/specs/groups/esa-cci-global-attrs.yml @@ -1,6 +1,3 @@ -#required-filename: -# func: checksit.generic.check_file_name - required-global-attrs0: func: checksit.generic.check_global_attrs params: @@ -43,24 +40,6 @@ required-global-attrs0: - spatial_resolution - key_variables vocab_attrs: - Conventions: __vocabs__:esa_cci_global_attrs:Conventions - project: __vocabs__:esa_cci_global_attrs:project - license: __vocabs__:esa_cci_global_attrs:license - -var-requires1: - func: checksit.generic.check_var - params: - variable: - - latitude - defined_attrs: - - units:degree_north - - standard_name:latitude - -var-requires3: - func: checksit.generic.check_var - params: - variable: - - longitude - defined_attrs: - - units:degree_east - - standard_name:longitude + Conventions: __vocabs__:esa-cci-global-attrs-config:Conventions + project: __vocabs__:esa-cci-global-attrs-config:project + license: 
__vocabs__:esa-cci-global-attrs-config:license diff --git a/specs/groups/esa-cci-variable-attrs.yml b/specs/groups/esa-cci-variable-attrs.yml new file mode 100644 index 0000000..f5910f3 --- /dev/null +++ b/specs/groups/esa-cci-variable-attrs.yml @@ -0,0 +1,17 @@ +var-requires1: + func: checksit.generic.check_var + params: + variable: + - latitude + defined_attrs: + - units:degree_north + - standard_name:latitude + +var-requires3: + func: checksit.generic.check_var + params: + variable: + - longitude + defined_attrs: + - units:degree_east + - standard_name:longitude From 4a7de35dbee77459dbc8dc65d531977aa0d45f89 Mon Sep 17 00:00:00 2001 From: knappett Date: Mon, 4 Nov 2024 15:32:33 +0000 Subject: [PATCH 03/11] Moved esa cci yaml files into the esa-cci-v1.0 subdirectory. --- specs/groups/{ => esa-cci-v1.0}/esa-cci-file-name.yml | 0 specs/groups/{ => esa-cci-v1.0}/esa-cci-global-attrs.yml | 0 specs/groups/{ => esa-cci-v1.0}/esa-cci-variable-attrs.yml | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename specs/groups/{ => esa-cci-v1.0}/esa-cci-file-name.yml (100%) rename specs/groups/{ => esa-cci-v1.0}/esa-cci-global-attrs.yml (100%) rename specs/groups/{ => esa-cci-v1.0}/esa-cci-variable-attrs.yml (100%) diff --git a/specs/groups/esa-cci-file-name.yml b/specs/groups/esa-cci-v1.0/esa-cci-file-name.yml similarity index 100% rename from specs/groups/esa-cci-file-name.yml rename to specs/groups/esa-cci-v1.0/esa-cci-file-name.yml diff --git a/specs/groups/esa-cci-global-attrs.yml b/specs/groups/esa-cci-v1.0/esa-cci-global-attrs.yml similarity index 100% rename from specs/groups/esa-cci-global-attrs.yml rename to specs/groups/esa-cci-v1.0/esa-cci-global-attrs.yml diff --git a/specs/groups/esa-cci-variable-attrs.yml b/specs/groups/esa-cci-v1.0/esa-cci-variable-attrs.yml similarity index 100% rename from specs/groups/esa-cci-variable-attrs.yml rename to specs/groups/esa-cci-v1.0/esa-cci-variable-attrs.yml From b914fc02805d8c8adea4faa8070c27d8d586e041 Mon Sep 17 00:00:00 
2001 From: knappett Date: Mon, 4 Nov 2024 17:04:35 +0000 Subject: [PATCH 04/11] Modified generic.py with try/except statement for checking filename fields so that an error message is shown if the number of fields in the yaml file is exceeded. --- checksit/generic.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/checksit/generic.py b/checksit/generic.py index 7a63763..91b08e5 100644 --- a/checksit/generic.py +++ b/checksit/generic.py @@ -580,7 +580,13 @@ def check_generic_file_name(file_name, vocab_checks=None, segregator=None, exten print('') print(idx, key) num=f"{idx:02}" - field=vocab_checks["field"+num] + try: + field=vocab_checks["field"+num] + except: + errors.append( + f"[file name]: Invalid number of file name fields - 'field{num}' not defined in yaml config file." + ) + print(errors[-1]) if field.startswith('__vocabs__') or field.startswith('__URL__'): if ( @@ -588,9 +594,9 @@ def check_generic_file_name(file_name, vocab_checks=None, segregator=None, exten != [] ): errors.append( - f"[file name]: Invalid file name format - unknown field '{key}'" + f"[file name]: Unknown field '{key}' in vocab {field}" ) - print(errors) + print(errors[-1]) elif field.startswith('__date__'): datefmts=(field.split(":"))[1] fmts=(datefmts.split(",")) @@ -598,8 +604,9 @@ def check_generic_file_name(file_name, vocab_checks=None, segregator=None, exten if not DATE_REGEX_GENERIC.match(key): errors.append( - f"[file name]: Invalid file name format - bad date format '{key}'" + f"[file name]: Expecting date/time - bad date format '{key}'" ) + print(errors[-1]) else: valid_date_found = False for f in fmts: @@ -613,23 +620,23 @@ def check_generic_file_name(file_name, vocab_checks=None, segregator=None, exten print(f"Date string {key} matches the required format") else: errors.append( - f"[file name]: Invalid file name format - invalid date in file name '{key}'" + f"[file name]: Invalid date/time string '{key}'" ) - print(errors) + 
print(errors[-1]) elif field.startswith('__version__'): verfmt=(field.split(":"))[1] if re.match(verfmt, key): print(f"File version {key} matches the required format") else: errors.append( - f"[file name]: Invalid file name format - invalid file version in file name '{key}'" + f"[file name]: Invalid file version '{key}'" ) - print(errors) + print(errors[-1]) else: errors.append( f"[file name]: {field} field type not recognised" ) - print(errors) + print(errors[-1]) return errors, warnings From 445b0a40fb67a84c135c3b58cfddf4e883f5aeee Mon Sep 17 00:00:00 2001 From: knappett Date: Tue, 5 Nov 2024 13:44:07 +0000 Subject: [PATCH 05/11] Updated check_generic_file_name with a check of the number of file name parts against the number defined in the user supplied yaml file, in place of a try-except statement. --- checksit/generic.py | 102 +++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 43 deletions(-) diff --git a/checksit/generic.py b/checksit/generic.py index 91b08e5..58f3542 100644 --- a/checksit/generic.py +++ b/checksit/generic.py @@ -580,63 +580,79 @@ def check_generic_file_name(file_name, vocab_checks=None, segregator=None, exten print('') print(idx, key) num=f"{idx:02}" - try: - field=vocab_checks["field"+num] - except: + + # Check if number of file name parts matches the number of fields specified in the user-defined yaml file + if len(vocab_checks) < len(file_name_parts): errors.append( - f"[file name]: Invalid number of file name fields - 'field{num}' not defined in yaml config file." - ) + f"[file name]: Number of file name fields ({len(file_name_parts)}) is greater than the {len(vocab_checks)} fields expected." + ) print(errors[-1]) + break + elif len(vocab_checks) > len(file_name_parts): + errors.append( + f"[file name]: Number of file name fields ({len(file_name_parts)}) is less than the {len(vocab_checks)} fields expected." 
+ ) + print(errors[-1]) + break + else: + field=vocab_checks["field"+num] - if field.startswith('__vocabs__') or field.startswith('__URL__'): - if ( - vocabs.check(field, key) - != [] - ): + if field.startswith('__vocabs__') or field.startswith('__URL__'): + # VOCAB (config or URL) + if ( + vocabs.check(field, key) + != [] + ): + errors.append( + f"[file name]: Unknown field '{key}' in vocab {field}" + ) + print(errors[-1]) + + elif field.startswith('__date__'): + # DATE REGEX + datefmts=(field.split(":"))[1] + fmts=(datefmts.split(",")) + print(f"Valid date formats: {fmts}") + + if not DATE_REGEX_GENERIC.match(key): errors.append( - f"[file name]: Unknown field '{key}' in vocab {field}" + f"[file name]: Expecting date/time - bad date format '{key}'" ) print(errors[-1]) - elif field.startswith('__date__'): - datefmts=(field.split(":"))[1] - fmts=(datefmts.split(",")) - print(f"Valid date formats: {fmts}") + else: + valid_date_found = False + for f in fmts: + try: + t = dt.datetime.strptime(key, f) + valid_date_found = True + break + except ValueError: + pass + if valid_date_found: + print(f"Date string {key} matches the required format") + else: + errors.append( + f"[file name]: Invalid date/time string '{key}'" + ) + print(errors[-1]) - if not DATE_REGEX_GENERIC.match(key): - errors.append( - f"[file name]: Expecting date/time - bad date format '{key}'" - ) - print(errors[-1]) - else: - valid_date_found = False - for f in fmts: - try: - t = dt.datetime.strptime(key, f) - valid_date_found = True - break - except ValueError: - pass - if valid_date_found: - print(f"Date string {key} matches the required format") + elif field.startswith('__version__'): + # FILE/PRODUCT VERSION + verfmt=(field.split(":"))[1] + if re.match(verfmt, key): + print(f"File version {key} matches the required format") else: errors.append( - f"[file name]: Invalid date/time string '{key}'" + f"[file name]: Invalid file version '{key}'" ) print(errors[-1]) - elif field.startswith('__version__'): 
- verfmt=(field.split(":"))[1] - if re.match(verfmt, key): - print(f"File version {key} matches the required format") + else: + # FIELD NOT RECOGNISED errors.append( - f"[file name]: Invalid file version '{key}'" - ) + f"[file name]: {field} field type not recognised" + ) print(errors[-1]) - else: - errors.append( - f"[file name]: {field} field type not recognised" - ) - print(errors[-1]) return errors, warnings From 80db5cbeb536e61d5e4cdeda83177407db0e9ea7 Mon Sep 17 00:00:00 2001 From: knappett Date: Tue, 5 Nov 2024 13:45:20 +0000 Subject: [PATCH 06/11] Added test_check_generic_file_name to test_generic.py. --- tests/test_generic.py | 47 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/test_generic.py b/tests/test_generic.py index 1ef8d27..be4501a 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -463,6 +463,53 @@ def test_check_file_name(): assert errors == [] assert warnings == [] +def test_check_generic_file_name(): + # Test that the function correctly identifies invalid instrument name + vocab_checks = { + 'field00': '__vocabs__:esa-cci-file-name-config:field00', + 'field01': '__vocabs__:esa-cci-file-name-config:field01', + 'field02': '__vocabs__:esa-cci-file-name-config:field02', + 'field03': '__URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json', + 'field04': '__URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json', + 'field05': '__date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S', + 'field06': '__version__:^fv\\d?\\d.?\\d?\\d?$' + } + segregator = { + 'seg': '-' + } + extension = { + 'ext': '.nc' + } + + # Legitimate file name - should pass wihtout error + file_name = "ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-20231231000000-fv09.1.nc" + errors, warnings = cg.check_generic_file_name(file_name, vocab_checks, segregator, extension) + assert errors == [] + assert warnings == [] + + # Incorrect field00 + file_name = "ESAC3S-SOILMOISTURE-L3S-SSMV-COMBINED-20231231000000-fv09.1.nc" + 
errors, warnings = cg.check_generic_file_name(file_name, vocab_checks, segregator, extension) + assert errors == ["[file name]: Unknown field 'ESAC3S' in vocab __vocabs__:esa-cci-file-name-config:field00"] + assert warnings == [] + + # Incorrect multiple fields + file_name = "ESACCI-SOILMOISTURE-L3S-SSMV-SSS-COMBINED-20231231000000-fv09.1.nc" + errors, warnings = cg.check_generic_file_name(file_name, vocab_checks, segregator, extension) + assert errors == ["[file name]: Number of file name fields (8) is greater than the 7 fields expected."] + assert warnings == [] + + # Incorrect date + file_name = "ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-20231241000000-fv09.1.nc" + errors, warnings = cg.check_generic_file_name(file_name, vocab_checks, segregator, extension) + assert errors == ["[file name]: Invalid date/time string '20231241000000'"] + assert warnings == [] + + # Incorrect version format + file_name = "ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-20231231000000-fv09.2.1.nc" + errors, warnings = cg.check_generic_file_name(file_name, vocab_checks, segregator, extension) + assert errors == ["[file name]: Invalid file version 'fv09.2.1'"] + assert warnings == [] def test_check_radar_moment_variables(): dct = { From f0ef22c8e5c914c90596e013c1d0b6145478d466 Mon Sep 17 00:00:00 2001 From: knappett Date: Mon, 18 Nov 2024 16:53:44 +0000 Subject: [PATCH 07/11] Added keyword spec_verbose to check_generic_file_name to ensure that specs comparison information is only printed to screen when spec_verbose is set in the yaml file. 
--- checksit/cvs.py | 8 ++- checksit/generic.py | 66 ++++++++++++------- .../groups/esa-cci-v1.0/esa-cci-file-name.yml | 2 + 3 files changed, 51 insertions(+), 25 deletions(-) diff --git a/checksit/cvs.py b/checksit/cvs.py index 2ffd117..d425f59 100644 --- a/checksit/cvs.py +++ b/checksit/cvs.py @@ -112,11 +112,12 @@ def lookup(self, vocab_lookup): return obj - def check(self, vocab_lookup, value, label="", lookup=True): + def check(self, vocab_lookup, value, label="", lookup=True, spec_verb=False): # Return a list of errors - empty list if no errors errors = [] options = [self.lookup(vocab_lookup) if lookup else vocab_lookup][0] - print(f"Vocab lookup: {vocab_lookup}") + if spec_verb: + print(f"Vocab lookup: {vocab_lookup}") if isinstance(options, list): if value not in options: @@ -124,7 +125,8 @@ def check(self, vocab_lookup, value, label="", lookup=True): f"{label} '{value}' not in vocab options: {options} (using: '{vocab_lookup}')" ) else: - print(f"Value: {value} is in list {options}") + if spec_verb: + print(f"Value: {value} is in list {options}") elif isinstance(options, dict): for key in options.keys(): if key in value.keys(): diff --git a/checksit/generic.py b/checksit/generic.py index 58f3542..d5738de 100644 --- a/checksit/generic.py +++ b/checksit/generic.py @@ -554,31 +554,43 @@ def check_file_name(file_name, vocab_checks=None, rule_checks=None, **kwargs): return errors, warnings -def check_generic_file_name(file_name, vocab_checks=None, segregator=None, extension=None, **kwargs): +def check_generic_file_name(file_name, vocab_checks=None, segregator=None, extension=None, spec_verbose=False, **kwargs): # Requires yaml file containing a list of file name fields and segregators # Loop over each file field and segregator until there are no more # check against defined file extension vocab_checks = vocab_checks or {} - seg = segregator["seg"] or '_' #'-' - ext = extension["ext"] or '.test' #'.nc' + try: + seg = segregator["seg"] + except: + seg='_' + try: + 
ext = extension["ext"] + except: + ext = '.test' + try: + spec_verb = spec_verbose["spec_verb"] + except: + spec_verb = False + errors = [] warnings = [] + # get filename parts extracted_name = file_name.removesuffix(ext) file_name_parts = extracted_name.split(seg) - print(f"File name: {file_name}") - print(f"Segregator: {seg}") - print(f"Extension: {ext}") - print(f"All file name parts: {file_name_parts}") + if spec_verb: + print(f"File name: {file_name}") + print(f"Segregator: {seg}") + print(f"Extension: {ext}") + print(f"All file name parts: {file_name_parts}") # Loop over file name parts - # Assume fields in yml file are in the same order - for idx, key in enumerate(file_name_parts): - print('') - print(idx, key) + if spec_verb: + print('') + print(idx, key) num=f"{idx:02}" # Check if number of file name parts matches the number of fields specified in the user-defined yaml file @@ -586,13 +598,15 @@ def check_generic_file_name(file_name, vocab_checks=None, segregator=None, exten errors.append( f"[file name]: Number of file name fields ({len(file_name_parts)}) is greater than the {len(vocab_checks)} fields expected." ) - print(errors[-1]) + if spec_verb: + print(errors[-1]) break elif len(vocab_checks) > len(file_name_parts): errors.append( f"[file name]: Number of file name fields ({len(file_name_parts)}) is less than the {len(vocab_checks)} fields expected." 
) - print(errors[-1]) + if spec_verb: + print(errors[-1]) break else: field=vocab_checks["field"+num] @@ -600,25 +614,28 @@ def check_generic_file_name(file_name, vocab_checks=None, segregator=None, exten if field.startswith('__vocabs__') or field.startswith('__URL__'): # VOCAB (config or URL) if ( - vocabs.check(field, key) + vocabs.check(field, key, spec_verb=spec_verb) != [] ): errors.append( f"[file name]: Unknown field '{key}' in vocab {field}" ) - print(errors[-1]) + if spec_verb: + print(errors[-1]) elif field.startswith('__date__'): # DATE REGEX datefmts=(field.split(":"))[1] fmts=(datefmts.split(",")) - print(f"Valid date formats: {fmts}") + if spec_verb: + print(f"Valid date formats: {fmts}") if not DATE_REGEX_GENERIC.match(key): errors.append( f"[file name]: Expecting date/time - bad date format '{key}'" ) - print(errors[-1]) + if spec_verb: + print(errors[-1]) else: valid_date_found = False for f in fmts: @@ -629,30 +646,35 @@ def check_generic_file_name(file_name, vocab_checks=None, segregator=None, exten except ValueError: pass if valid_date_found: - print(f"Date string {key} matches the required format") + if spec_verb: + print(f"Date string {key} matches the required format") else: errors.append( f"[file name]: Invalid date/time string '{key}'" ) - print(errors[-1]) + if spec_verb: + print(errors[-1]) elif field.startswith('__version__'): # FILE/PRODUCT VERSION verfmt=(field.split(":"))[1] if re.match(verfmt, key): - print(f"File version {key} matches the required format") + if spec_verb: + print(f"File version {key} matches the required format") else: errors.append( f"[file name]: Invalid file version '{key}'" ) - print(errors[-1]) + if spec_verb: + print(errors[-1]) else: # FIELD NOT RECOGNISED errors.append( f"[file name]: {field} field type not recognised" ) - print(errors[-1]) + if spec_verb: + print(errors[-1]) return errors, warnings diff --git a/specs/groups/esa-cci-v1.0/esa-cci-file-name.yml b/specs/groups/esa-cci-v1.0/esa-cci-file-name.yml 
index 8ec230c..8028be8 100644 --- a/specs/groups/esa-cci-v1.0/esa-cci-file-name.yml +++ b/specs/groups/esa-cci-v1.0/esa-cci-file-name.yml @@ -22,3 +22,5 @@ file-name-format: seg: '-' extension: ext: '.nc' + spec_verbose: + spec_verb: False From 0a478059e4034deda6702b5df53d33a3c84a2530 Mon Sep 17 00:00:00 2001 From: knappett Date: Wed, 11 Dec 2024 14:34:25 +0000 Subject: [PATCH 08/11] Added new ESA CCI filename specs files. Added more verbose error messages to generic.py and updated test_generic. --- checksit/generic.py | 8 ++--- .../esa-cci-file-name-add-seg.yml | 28 ++++++++++++++++ .../esa-cci-file-name-ghrsst-add-seg-gds.yml | 30 +++++++++++++++++ .../esa-cci-file-name-ghrsst-add-seg.yml | 28 ++++++++++++++++ .../esa-cci-file-name-ghrsst-gds.yml | 28 ++++++++++++++++ .../esa-cci-file-name-ghrsst-std.yml | 26 +++++++++++++++ tests/test_generic.py | 32 ++++++++++++++++--- 7 files changed, 172 insertions(+), 8 deletions(-) create mode 100644 specs/groups/esa-cci-v1.0/esa-cci-file-name-add-seg.yml create mode 100644 specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg-gds.yml create mode 100644 specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg.yml create mode 100644 specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-gds.yml create mode 100644 specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-std.yml diff --git a/checksit/generic.py b/checksit/generic.py index d5738de..1fd31df 100644 --- a/checksit/generic.py +++ b/checksit/generic.py @@ -618,7 +618,7 @@ def check_generic_file_name(file_name, vocab_checks=None, segregator=None, exten != [] ): errors.append( - f"[file name]: Unknown field '{key}' in vocab {field}" + f"[file name]: Unknown field '{key}' in vocab {field}." 
) if spec_verb: print(errors[-1]) @@ -650,7 +650,7 @@ def check_generic_file_name(file_name, vocab_checks=None, segregator=None, exten print(f"Date string {key} matches the required format") else: errors.append( - f"[file name]: Invalid date/time string '{key}'" + f"[file name]: Invalid date/time string '{key}'. Date/time should take the form YYYY[MM[DD[HH[MM[SS]]]]], where the fields in brackets are optional." ) if spec_verb: print(errors[-1]) @@ -663,7 +663,7 @@ def check_generic_file_name(file_name, vocab_checks=None, segregator=None, exten print(f"File version {key} matches the required format") else: errors.append( - f"[file name]: Invalid file version '{key}'" + f"[file name]: Invalid file version '{key}'. File versions should take the form n{{1,}}[.n{{1,}}]." ) if spec_verb: print(errors[-1]) @@ -671,7 +671,7 @@ def check_generic_file_name(file_name, vocab_checks=None, segregator=None, exten else: # FIELD NOT RECOGNISED errors.append( - f"[file name]: {field} field type not recognised" + f"[file name]: {field} field type not recognised." ) if spec_verb: print(errors[-1]) diff --git a/specs/groups/esa-cci-v1.0/esa-cci-file-name-add-seg.yml b/specs/groups/esa-cci-v1.0/esa-cci-file-name-add-seg.yml new file mode 100644 index 0000000..a0d2e7f --- /dev/null +++ b/specs/groups/esa-cci-v1.0/esa-cci-file-name-add-seg.yml @@ -0,0 +1,28 @@ +file-name-format: + func: checksit.generic.check_generic_file_name + params: + vocab_checks: + # ESACCI + field00: __vocabs__:esa-cci-file-name-config:field00 + # CCI Project (e.g. SEAICE) + field01: __vocabs__:esa-cci-file-name-config:field01 + # Processing Level (e.g. L3C) + field02: __vocabs__:esa-cci-file-name-config:field02 + # Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary + # https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json + field03: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json + # Product String (e.g. 
NIMBUS5_ESMR-EASE2_NH) + # http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + # Additional segregator (also stored in the 'product' vocabulary) + field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + # Date and time + field06: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S + # File version + field07: __version__:^fv\d?\d.?\d?\d?$ + segregator: + seg: '-' + extension: + ext: '.nc' + spec_verbose: + spec_verb: True diff --git a/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg-gds.yml b/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg-gds.yml new file mode 100644 index 0000000..ce8269b --- /dev/null +++ b/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg-gds.yml @@ -0,0 +1,30 @@ +file-name-format: + func: checksit.generic.check_generic_file_name + params: + vocab_checks: + # Date and time + field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S + # ESACCI + field01: __vocabs__:esa-cci-file-name-config:field00 + # Processing Level (e.g. L3C) + field02: __vocabs__:esa-cci-file-name-config:field02 + # CCI Project (e.g. SEAICE) + field03: __vocabs__:esa-cci-file-name-config:field01 + # Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary + # https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json + field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json + # Product String (e.g. 
NIMBUS5_ESMR-EASE2_NH) + # http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + # Additional segregator (also stored in the 'product' vocabulary) + field06: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + # GDS version + field07: __version__:^v\d?\d.?\d?\d?$ + # File version + field08: __version__:^fv\d?\d.?\d?\d?$ + segregator: + seg: '-' + extension: + ext: '.nc' + spec_verbose: + spec_verb: True diff --git a/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg.yml b/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg.yml new file mode 100644 index 0000000..606d262 --- /dev/null +++ b/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-add-seg.yml @@ -0,0 +1,28 @@ +file-name-format: + func: checksit.generic.check_generic_file_name + params: + vocab_checks: + # Date and time + field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S + # ESACCI + field01: __vocabs__:esa-cci-file-name-config:field00 + # Processing Level (e.g. L3C) + field02: __vocabs__:esa-cci-file-name-config:field02 + # CCI Project (e.g. SEAICE) + field03: __vocabs__:esa-cci-file-name-config:field01 + # Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary + # https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json + field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json + # Product String (e.g. 
NIMBUS5_ESMR-EASE2_NH) + # http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + # Additional segregator (also stored in the 'product' vocabulary) + field06: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + # File version + field07: __version__:^fv\d?\d.?\d?\d?$ + segregator: + seg: '-' + extension: + ext: '.nc' + spec_verbose: + spec_verb: True diff --git a/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-gds.yml b/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-gds.yml new file mode 100644 index 0000000..6941f2d --- /dev/null +++ b/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-gds.yml @@ -0,0 +1,28 @@ +file-name-format: + func: checksit.generic.check_generic_file_name + params: + vocab_checks: + # Date and time + field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S + # ESACCI + field01: __vocabs__:esa-cci-file-name-config:field00 + # Processing Level (e.g. L3C) + field02: __vocabs__:esa-cci-file-name-config:field02 + # CCI Project (e.g. SEAICE) + field03: __vocabs__:esa-cci-file-name-config:field01 + # Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary + # https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json + field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json + # Product String (e.g. 
NIMBUS5_ESMR-EASE2_NH) + # http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + # GDS version + field07: __version__:^v\d?\d.?\d?\d?$ + # File version + field08: __version__:^fv\d?\d.?\d?\d?$ + segregator: + seg: '-' + extension: + ext: '.nc' + spec_verbose: + spec_verb: True diff --git a/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-std.yml b/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-std.yml new file mode 100644 index 0000000..f042cc4 --- /dev/null +++ b/specs/groups/esa-cci-v1.0/esa-cci-file-name-ghrsst-std.yml @@ -0,0 +1,26 @@ +file-name-format: + func: checksit.generic.check_generic_file_name + params: + vocab_checks: + # Date and time + field00: __date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S + # ESACCI + field01: __vocabs__:esa-cci-file-name-config:field00 + # Processing Level (e.g. L3C) + field02: __vocabs__:esa-cci-file-name-config:field02 + # CCI Project (e.g. SEAICE) + field03: __vocabs__:esa-cci-file-name-config:field01 + # Data Type (e.g. SICONC): this is the 'alternative label' of those in the vocabulary + # https://vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json + field04: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json + # Product String (e.g. 
NIMBUS5_ESMR-EASE2_NH) + # http://vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + field05: __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json + # File version + field06: __version__:^fv\d?\d.?\d?\d?$ + segregator: + seg: '-' + extension: + ext: '.nc' + spec_verbose: + spec_verb: True diff --git a/tests/test_generic.py b/tests/test_generic.py index be4501a..c41e314 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -464,7 +464,7 @@ def test_check_file_name(): assert warnings == [] def test_check_generic_file_name(): - # Test that the function correctly identifies invalid instrument name + # Test for Standard ESA CCI file name vocab_checks = { 'field00': '__vocabs__:esa-cci-file-name-config:field00', 'field01': '__vocabs__:esa-cci-file-name-config:field01', @@ -490,7 +490,7 @@ def test_check_generic_file_name(): # Incorrect field00 file_name = "ESAC3S-SOILMOISTURE-L3S-SSMV-COMBINED-20231231000000-fv09.1.nc" errors, warnings = cg.check_generic_file_name(file_name, vocab_checks, segregator, extension) - assert errors == ["[file name]: Unknown field 'ESAC3S' in vocab __vocabs__:esa-cci-file-name-config:field00"] + assert errors == ["[file name]: Unknown field 'ESAC3S' in vocab __vocabs__:esa-cci-file-name-config:field00."] assert warnings == [] # Incorrect multiple fields @@ -502,13 +502,37 @@ def test_check_generic_file_name(): # Incorrect date file_name = "ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-20231241000000-fv09.1.nc" errors, warnings = cg.check_generic_file_name(file_name, vocab_checks, segregator, extension) - assert errors == ["[file name]: Invalid date/time string '20231241000000'"] + assert errors == ["[file name]: Invalid date/time string '20231241000000'. 
Date/time should take the form YYYY[MM[DD[HH[MM[SS]]]]], where the fields in brackets are optional."] assert warnings == [] # Incorrect version format file_name = "ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-20231231000000-fv09.2.1.nc" errors, warnings = cg.check_generic_file_name(file_name, vocab_checks, segregator, extension) - assert errors == ["[file name]: Invalid file version 'fv09.2.1'"] + assert errors == ["[file name]: Invalid file version 'fv09.2.1'. File versions should take the form n{1,}[.n{1,}]."] + assert warnings == [] + + # Test for Additional Segregator ESA CCI file name + vocab_checks = { + 'field00': '__vocabs__:esa-cci-file-name-config:field00', + 'field01': '__vocabs__:esa-cci-file-name-config:field01', + 'field02': '__vocabs__:esa-cci-file-name-config:field02', + 'field03': '__URL__vocab.ceda.ac.uk/scheme/cci/cci-content/dataType.json', + 'field04': '__URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json', + 'field05': '__URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json', + 'field06': '__date__:%Y,%Y%m,%Y%m%d,%Y%m%d%H,%Y%m%d%H%M,%Y%m%d%H%M%S', + 'field07': '__version__:^fv\\d?\\d.?\\d?\\d?$' + } + segregator = { + 'seg': '-' + } + extension = { + 'ext': '.nc' + } + + # Correctly structured Additional Segregator ESA CCI file name - only the placeholder 'TEST_ADD_SEG' should be reported as unknown in the product vocab + file_name = "ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-TEST_ADD_SEG-20231231000000-fv09.1.nc" + errors, warnings = cg.check_generic_file_name(file_name, vocab_checks, segregator, extension) + assert errors == ["[file name]: Unknown field 'TEST_ADD_SEG' in vocab __URL__vocab.ceda.ac.uk/scheme/cci/cci-content/product.json."] assert warnings == [] def test_check_radar_moment_variables(): From 5d1ef42ba0f060b1c9a787f0f2eec306452b5584 Mon Sep 17 00:00:00 2001 From: knappett Date: Wed, 11 Dec 2024 14:53:59 +0000 Subject: [PATCH 09/11] Fixed issue with removesuffix. 
--- checksit/generic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/checksit/generic.py b/checksit/generic.py index 1fd31df..7fe8771 100644 --- a/checksit/generic.py +++ b/checksit/generic.py @@ -577,7 +577,10 @@ def check_generic_file_name(file_name, vocab_checks=None, segregator=None, exten warnings = [] # get filename parts - extracted_name = file_name.removesuffix(ext) + if not isinstance(file_name,str): + raise ValueError + + extracted_name = file_name.replace(ext,'') file_name_parts = extracted_name.split(seg) if spec_verb: From 84ead53c1d32a4a63d4a995775724c3943daffa3 Mon Sep 17 00:00:00 2001 From: knappett Date: Tue, 17 Dec 2024 17:28:01 +0000 Subject: [PATCH 10/11] Fixed issue with vocab_list, changed function name to _load_from_url_github, and reinstated the if 'latest' statement in the correct place. Also removed some unnecessary if/else indentation. --- checksit/cvs.py | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/checksit/cvs.py b/checksit/cvs.py index d425f59..afb126f 100644 --- a/checksit/cvs.py +++ b/checksit/cvs.py @@ -25,51 +25,56 @@ def _load(self, vocab_id): vocab_file = os.path.join(vocabs_dir, f"{vocab_id}.json") self._vocabs[vocab_id] = json.load(open(vocab_file)) - def _load_from_url_ncas(self, vocab_id_url): + def _load_from_url_github(self, vocab_id_url: str): + vocab_list = [] vocab_id_url_base = vocab_id_url.split("/__latest__")[0] vocab_id_url_base = vocab_id_url_base.replace( "raw.githubusercontent.com", "github.com" ) - latest_version = requests.get( - f"{vocab_id_url_base}/releases/latest" - ).url.split("/")[-1] - vocab_id_url = vocab_id_url.replace("__latest__", latest_version) + if "/__latest__/" in vocab_id_url: + latest_version = requests.get( + f"{vocab_id_url_base}/releases/latest" + ).url.split("/")[-1] + vocab_id_url = vocab_id_url.replace("__latest__", latest_version) res = requests.get(vocab_id_url.replace("__URL__", 
"https://")) - if res.status_code == 200: - vocab_list = res.json() - else: + if res.status_code != 200: print(f"[WARNING] Failed to load vocab: {vocab_id_url}") + return vocab_list + vocab_list = res.json() + return vocab_list - def _load_from_url_esacci(self, vocab_id_url): + def _load_from_url_esacci(self, vocab_id_url: str): + vocab_list = [] res = requests.get(vocab_id_url) - if res.status_code == 200: - js = res.json() + if res.status_code != 200: + print(f"[WARNING] Failed to load vocab: {vocab_id_url}") + return vocab_list + js = res.json() - if 'dataType' in vocab_id_url: - vocab_list=sorted([altLabel[0]["@value"] for js_dct in js for key, altLabel in js_dct.items() if key.endswith("#altLabel")]) - elif 'product' in vocab_id_url: - vocab_list=sorted([prefLabel[0]["@value"] for js_dct in js for key, prefLabel in js_dct.items() if key.endswith("#prefLabel")]) - else: - print('ESA CCI vocab url not recognised.') + if 'dataType' in vocab_id_url: + vocab_list=sorted([altLabel[0]["@value"] for js_dct in js for key, altLabel in js_dct.items() if key.endswith("#altLabel")]) + elif 'product' in vocab_id_url: + vocab_list=sorted([prefLabel[0]["@value"] for js_dct in js for key, prefLabel in js_dct.items() if key.endswith("#prefLabel")]) else: - print(f"[WARNING] Failed to load vocab: {vocab_id_url}") + print(f"[WARNING] ESA CCI vocab url not recognised: {vocab_id_url}") + import pdb; pdb.set_trace() + return vocab_list - def _load_from_url(self, vocab_id): + def _load_from_url(self, vocab_id: str): # Loads a specific vocabulary from a URL vocab_id_url = vocab_id.replace("__URL__", "https://") if ( vocab_id_url.startswith("https://raw.githubusercontent.com") - and "/__latest__/" in vocab_id_url ): - vocab_list=self._load_from_url_ncas(vocab_id_url) + vocab_list=self._load_from_url_github(vocab_id_url) elif ( vocab_id_url.startswith("https://vocab.ceda.ac.uk") ): vocab_list=self._load_from_url_esacci(vocab_id_url) else: - print(f"Vocabulary url provided is not 
recognised") + print(f"Vocabulary url provided is not recognised: {vocab_id_url}") self._vocabs[vocab_id] = vocab_list From 01f8438665fcdcd153ab894eefdc9c474bdc5b8a Mon Sep 17 00:00:00 2001 From: knappett Date: Tue, 17 Dec 2024 18:05:13 +0000 Subject: [PATCH 11/11] Bugfix - removed pdb.set_trace() call. --- checksit/cvs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/checksit/cvs.py b/checksit/cvs.py index afb126f..360494a 100644 --- a/checksit/cvs.py +++ b/checksit/cvs.py @@ -58,7 +58,6 @@ def _load_from_url_esacci(self, vocab_id_url: str): vocab_list=sorted([prefLabel[0]["@value"] for js_dct in js for key, prefLabel in js_dct.items() if key.endswith("#prefLabel")]) else: print(f"[WARNING] ESA CCI vocab url not recognised: {vocab_id_url}") - import pdb; pdb.set_trace() return vocab_list