diff --git a/checksit/check.py b/checksit/check.py index abf06b9e..2d0eaf6f 100644 --- a/checksit/check.py +++ b/checksit/check.py @@ -23,9 +23,19 @@ class Checker: - def __init__(self, template="auto", mappings=None, extra_rules=None, specs=None, ignore_attrs=None, - auto_cache=False, verbose=False, log_mode="standard", ignore_warnings=False, - skip_spellcheck=False): + def __init__( + self, + template="auto", + mappings=None, + extra_rules=None, + specs=None, + ignore_attrs=None, + auto_cache=False, + verbose=False, + log_mode="standard", + ignore_warnings=False, + skip_spellcheck=False, + ): self.template = template self.mappings = mappings or {} self.extra_rules = extra_rules or {} @@ -43,8 +53,9 @@ def _update_check_context(self, file_path, template): self._check_context["size"] = os.path.getsize(file_path) self._check_context["template"] = template - def _compare_items(self, rec, tmpl, key, label, mappings=None, - extra_rules=None, ignore_attrs=None): + def _compare_items( + self, rec, tmpl, key, label, mappings=None, extra_rules=None, ignore_attrs=None + ): mappings = mappings or self.mappings extra_rules = extra_rules or self.extra_rules @@ -61,55 +72,68 @@ def _compare_items(self, rec, tmpl, key, label, mappings=None, rec_key = mappings.get(key, key) if isinstance(tmpl[key], dict): - errors.extend(self._compare_dicts(rec, tmpl, key, mappings=mappings, ignore_attrs=ignore_attrs)) + errors.extend( + self._compare_dicts( + rec, tmpl, key, mappings=mappings, ignore_attrs=ignore_attrs + ) + ) else: tmpl_value = str(tmpl[key]) if label_key in conf["settings"]["excludes"]: pass elif key.startswith(f"{vocabs_prefix}:"): - errors.extend(vocabs.check(tmpl[key], rec.get(rec_key, UNDEFINED), label=label_key)) + errors.extend( + vocabs.check( + tmpl[key], rec.get(rec_key, UNDEFINED), label=label_key + ) + ) elif tmpl_value.startswith(f"{vocabs_prefix}:"): - errors.extend(vocabs.check(tmpl[key], rec.get(rec_key, UNDEFINED), label=label_key)) + errors.extend( + vocabs.check( + tmpl[key], rec.get(rec_key, UNDEFINED), label=label_key + ) + ) # Rule defined in template value elif tmpl_value.startswith(f"{rules_prefix}:"): - errors.extend(rules.check(tmpl[key], rec.get(rec_key, UNDEFINED), - context=self._check_context, label=label_key)) + errors.extend( + rules.check( + tmpl[key], + rec.get(rec_key, UNDEFINED), + context=self._check_context, + label=label_key, + ) + ) # Rule defined in `extra_rules` dictionary elif [rule for rule in extra_rules if rule.startswith(label_key)]: - rule_key = [rule for rule in extra_rules if rule.startswith(label_key)][0] + rule_key = [rule for rule in extra_rules if rule.startswith(label_key)][ + 0 + ] rule = extra_rules[rule_key] - errors.extend(rules.check(rule, rec.get(rec_key, UNDEFINED), - context=self._check_context, label=label_key)) + errors.extend( + rules.check( + rule, + rec.get(rec_key, UNDEFINED), + context=self._check_context, + label=label_key, + ) + ) # Else... 
elif tmpl[key] != rec.get(rec_key, UNDEFINED): - errors.append(f"{label_key}: '{rec.get(rec_key, UNDEFINED)}' does not match expected: '{tmpl[key]}'") + errors.append( + f"{label_key}: '{rec.get(rec_key, UNDEFINED)}' does not match expected: '{tmpl[key]}'" + ) return errors def _compare_dicts(self, record, template, label, mappings=None, ignore_attrs=None): mappings = mappings or self.mappings errors = [] - # list_types = [] #"variables" #- no longer used - as comparisons need key/values do_sort = False if label in ("dimensions", "global_attributes"): do_sort = True - # if label in list_types: - # tmpl = template[label] - # rec = record[label] - - # if len(tmpl) != len(rec): - # errors.append(f"[ERROR] Number of '{label}' items differs between template ({len(tmpl)}) and record ({len(rec)})") - # else: - # for i in range(len(tmpl)): - # t = tmpl[i] - # r = rec[i] - # for key in t: - # errors.extend(self.compare_items(r, t, key, label=label, mappings=mappings, ignore_attrs=ignore_attrs)) - # else: - # Recursively check dicts tmpl = template[label] rec_key = mappings.get(label, label) @@ -118,21 +142,40 @@ def _compare_dicts(self, record, template, label, mappings=None, ignore_attrs=No rec = record[rec_key] keys = tmpl.keys() - if do_sort: keys = sorted(keys) + if do_sort: + keys = sorted(keys) for key in keys: - errors.extend(self._compare_items(rec, tmpl, key, label=label, mappings=mappings, - ignore_attrs=ignore_attrs)) + errors.extend( + self._compare_items( + rec, + tmpl, + key, + label=label, + mappings=mappings, + ignore_attrs=ignore_attrs, + ) + ) else: errors.append(f"Expected item '{label}' not found in data file.") return errors - - def _check_file(self, file_content, template, mappings=None, extra_rules=None, specs=None, - ignore_attrs=None, log_mode="standard", fmt_errors=None, - ignore_warnings=False, skip_spellcheck=False): - + + def _check_file( + self, + file_content, + template, + mappings=None, + extra_rules=None, + specs=None, + ignore_attrs=None, + log_mode="standard", + fmt_errors=None, + ignore_warnings=False, + skip_spellcheck=False, + ): + if hasattr(file_content, "to_dict"): record = file_content.to_dict() @@ -151,10 +194,14 @@ def _check_file(self, file_content, template, mappings=None, extra_rules=None, s for spec in specs: sr = SpecificationChecker(spec) - if 'amof-file-name' in spec: - spec_errors, spec_warnings = sr.run_checks(file_content.inpt.split("/")[-1]) + if "amof-file-name" in spec: + spec_errors, spec_warnings = sr.run_checks( + file_content.inpt.split("/")[-1] + ) else: - spec_errors, spec_warnings = sr.run_checks(record, skip_spellcheck=skip_spellcheck) + spec_errors, spec_warnings = sr.run_checks( + record, skip_spellcheck=skip_spellcheck + ) errors.extend(spec_errors) warnings.extend(spec_warnings) @@ -164,7 +211,13 @@ def _check_file(self, file_content, template, mappings=None, extra_rules=None, s sections = "dimensions", "variables", "global_attributes" for section in sections: - errs = self._compare_dicts(record, template, section, mappings=mappings, ignore_attrs=ignore_attrs) + errs = self._compare_dicts( + record, + template, + section, + mappings=mappings, + ignore_attrs=ignore_attrs, + ) errors.extend([f"[{section}] {err}" for err in errs]) if log_mode == "compact": @@ -181,9 +234,11 @@ def _check_file(self, file_content, template, mappings=None, extra_rules=None, s endstr = "\n" number = 0 print(f"{highest} | {number} ", end=endstr) - err_string = " | ".join([err.replace("|", "__VERTICAL_BAR_REPLACED__") for err in errors]) + err_string = " 
| ".join( + [err.replace("|", "__VERTICAL_BAR_REPLACED__") for err in errors] + ) if err_string: - print(f"| {err_string}") + print(f"| {err_string}") else: if errors: @@ -202,12 +257,180 @@ def _check_file(self, file_content, template, mappings=None, extra_rules=None, s print(f"\t{count:02d}. {warning}") if compliant: - print("[INFO] File is compliant!") - - - def check_file(self, file_path, template="auto", mappings=None, extra_rules=None, specs=None, - ignore_attrs=None, auto_cache=False, verbose=False, log_mode="standard", - ignore_warnings=False, skip_spellcheck=False): + print("[INFO] File is compliant!") + + def _get_ncas_specs( + self, file_path, file_content, log_mode="standard", verbose=False + ): + template = "auto" + specs = None + # find appropriate specs depending on convention + if file_path.split(".")[-1] == "nc" and ":Conventions" in file_content.cdl: + conventions = ( + file_content.cdl.split(":Conventions =")[1].split(";")[0].strip() + ) + # NCAS-GENERAL file + if any( + name in conventions + for name in ["NCAS-GENERAL", "NCAS-AMF", "NCAS-AMOF"] + ): + if verbose: + print("\nNCAS-AMOF file detected, finding correct spec files") + print("Finding correct AMOF version...") + version_number = ( + conventions[conventions.index("NCAS-") :] + .split("-")[2] + .replace('"', "") + ) + spec_folder = f"ncas-amof-{version_number}" + if verbose: + print(f" {version_number}") + + # check specs exist for that version + specs_dir = os.path.join( + conf["settings"].get("specs_dir", "./specs"), + f"groups/{spec_folder}", + ) + if not os.path.exists(specs_dir): + if verbose: + print( + f"Specs for version NCAS-GENERAL-{version_number} not found, attempting download..." + ) + try: + vocabs_dir = os.path.join( + conf["settings"].get("vocabs_dir", "./checksit/vocabs"), + f"AMF_CVs/{version_number}", + ) + cvs = urllib.request.urlopen( + f"https://github.com/ncasuk/AMF_CVs/tree/v{version_number}/AMF_CVs" + ) + data = cvs.readlines() + if not os.path.exists(specs_dir): + os.mkdir(specs_dir) + if not os.path.exists(vocabs_dir): + os.mkdir(vocabs_dir) + for line in data: + if ( + f'href="/ncasuk/AMF_CVs/blob/v{version_number}/AMF_CVs' + in line.decode() + ): + json_file = ( + line.decode().split('href="')[1].split('">')[0] + ) + if json_file.startswith("/ncasuk/AMF_CVs/blob/"): + cv = urllib.request.urlopen( + f"https://raw.githubusercontent.com{json_file.replace('/blob','')}" + ) + json_file_name = json_file.split("/")[-1] + with open( + f"{vocabs_dir}/{json_file_name}", "w" + ) as f: + _ = f.write(cv.read().decode()) + make_amof_specs(version_number) + if verbose: + print(" Downloaded of specs successful") + except urllib.error.HTTPError: + if log_mode == "compact": + print( + f"{file_path} | ABORTED | FATAL | Cannot download data for NCAS-AMOF-{version_number}" + ) + else: + print( + f"[ERROR]: Cannot download data for NCAS-AMOF-{version_number}." + ) + print("Aborting...") + sys.exit() + except PermissionError: + if log_mode == "compact": + print( + f"{file_path} | ABORTED | FATAL | Permission Error when trying to create folders or files within checksit." + ) + else: + print( + f"[ERROR]: Permission Error when trying to create folders or files within checksit." + ) + print( + f"Please talk to your Admin about installing data for NCAS-AMOF-{version_number}." 
+ ) + sys.exit() + except: + raise + + # get deployment mode and data product, to then get specs + deployment_mode = ( + file_content.cdl.split(":deployment_mode =")[1] + .split(";")[0] + .strip() + .strip('"') + ) + deploy_spec = f"{spec_folder}/amof-common-{deployment_mode}" + product = file_path.split("/")[-1].split("_")[3] + product_spec = f"{spec_folder}/amof-{product}" + specs = [ + f"{spec_folder}/amof-file-name", + deploy_spec, + product_spec, + f"{spec_folder}/amof-global-attrs", + ] + # don't need to do template check + template = "off" + + # NCAS-RADAR (coming soon...) + # if "NCAS-Radar" in conventions + + elif ( + file_path.split(".")[-1].lower() in IMAGE_EXTENSIONS + and "XMP-photoshop:Instructions" in file_content.global_attrs.keys() + ): + conventions = file_content.global_attrs["XMP-photoshop:Instructions"] + if ( + "National Centre for Atmospheric Science Image Metadata Standard" + in file_content.global_attrs["XMP-photoshop:Instructions"].replace( + "\n", " " + ) + ): + if verbose: + print("\nNCAS-IMAGE file detected, finding correct spec files") + print("Finding correct IMAGE version...") + version_number = ( + conventions.replace("\n", " ") + .split("Metadata Standard ")[1] + .split(":")[0] + ) + spec_folder = f"ncas-image-{version_number}" + if verbose: + print(f" {version_number}") + specs_dir = os.path.join( + conf["settings"].get("specs_dir", "./specs"), + f"groups/{spec_folder}", + ) + if not os.path.exists(specs_dir): + print( + f"[ERROR] specs for NCAS-IMAGE {version_number} can not be found." + ) + print("Aborting...") + sys.exit() + product = file_path.split("/")[-1].split("_")[3] + product_spec = f"{spec_folder}/amof-{product}" + specs = [product_spec, f"{spec_folder}/amof-image-global-attrs"] + template = "off" + + return template, specs + + def check_file( + self, + file_path, + template="auto", + mappings=None, + extra_rules=None, + specs=None, + ignore_attrs=None, + auto_cache=False, + verbose=False, + log_mode="standard", + ignore_warnings=False, + skip_spellcheck=False, + ): try: fp = FileParser() @@ -219,112 +442,25 @@ def check_file(self, file_path, template="auto", mappings=None, extra_rules=None else: raise Exception(err) - # if template == "auto": - # template = self._template_from_config(file_path, verbose) - # elif not os.path.isfile(template): - # if log_mode == "compact": - # print(f"{file_path} | ABORTED | FATAL | Cannot find template file specified") - # sys.exit(1) - # else: - # raise Exception(f"Cannot find specified template file: {template}") - - # tmpl = self.parse_file_header(template, auto_cache=auto_cache, verbose=verbose) - ### Check for NCAS data files and gather specs ### # if template and specs are "default" values, check to see if # file is an ncas file (assuming file name starts with instrument name) - if (template == "auto" and specs == None and - file_path.split("/")[-1].startswith("ncas-")): - # find appropriate specs depending on convention - if file_path.split(".")[-1] == "nc" and ":Conventions" in file_content.cdl: - conventions = file_content.cdl.split(":Conventions =")[1].split(";")[0].strip() - # NCAS-GENERAL file - if any(name in conventions for name in ["NCAS-GENERAL", "NCAS-AMF", "NCAS-AMOF"]): - if verbose: - print("\nNCAS-AMOF file detected, finding correct spec files") - print("Finding correct AMOF version...") - version_number = conventions[conventions.index("NCAS-"):].split("-")[2].replace('"','') - spec_folder = f"ncas-amof-{version_number}" - if verbose: print(f" {version_number}") - - # check specs exist for 
that version - specs_dir = os.path.join(conf["settings"].get("specs_dir", "./specs"), f"groups/{spec_folder}") - if not os.path.exists(specs_dir): - if verbose: print(f"Specs for version NCAS-GENERAL-{version_number} not found, attempting download...") - try: - vocabs_dir = os.path.join(conf["settings"].get("vocabs_dir", "./checksit/vocabs"), f"AMF_CVs/{version_number}") - cvs = urllib.request.urlopen(f"https://github.com/ncasuk/AMF_CVs/tree/v{version_number}/AMF_CVs") - data = cvs.readlines() - if not os.path.exists(specs_dir): - os.mkdir(specs_dir) - if not os.path.exists(vocabs_dir): - os.mkdir(vocabs_dir) - for line in data: - if f'href="/ncasuk/AMF_CVs/blob/v{version_number}/AMF_CVs' in line.decode(): - json_file = line.decode().split('href="')[1].split('">')[0] - if json_file.startswith("/ncasuk/AMF_CVs/blob/"): - cv = urllib.request.urlopen(f"https://raw.githubusercontent.com{json_file.replace('/blob','')}") - json_file_name = json_file.split("/")[-1] - with open(f"{vocabs_dir}/{json_file_name}", "w") as f: - _ = f.write(cv.read().decode()) - make_amof_specs(version_number) - if verbose: print(" Downloaded of specs successful") - except urllib.error.HTTPError: - if log_mode == "compact": - print(f"{file_path} | ABORTED | FATAL | Cannot download data for NCAS-AMOF-{version_number}") - else: - print(f"[ERROR]: Cannot download data for NCAS-AMOF-{version_number}.") - print("Aborting...") - sys.exit() - except PermissionError: - if log_mode == "compact": - print(f"{file_path} | ABORTED | FATAL | Permission Error when trying to create folders or files within checksit.") - else: - print(f"[ERROR]: Permission Error when trying to create folders or files within checksit.") - print(f"Please talk to your Admin about installing data for NCAS-AMOF-{version_number}.") - sys.exit() - except: - raise - - # get deployment mode and data product, to then get specs - deployment_mode = file_content.cdl.split(':deployment_mode =')[1].split(';')[0].strip().strip('"') - deploy_spec = f'{spec_folder}/amof-common-{deployment_mode}' - product = file_path.split('/')[-1].split('_')[3] - product_spec = f'{spec_folder}/amof-{product}' - specs = [f'{spec_folder}/amof-file-name', deploy_spec, product_spec, f'{spec_folder}/amof-global-attrs'] - # don't need to do template check - template = "off" - - # NCAS-RADAR (coming soon...) 
- # if "NCAS-Radar" in conventions - - elif (file_path.split(".")[-1].lower() in IMAGE_EXTENSIONS and - "XMP-photoshop:Instructions" in file_content.global_attrs.keys()): - conventions = file_content.global_attrs["XMP-photoshop:Instructions"] - if "National Centre for Atmospheric Science Image Metadata Standard" in file_content.global_attrs["XMP-photoshop:Instructions"].replace("\n"," "): - if verbose: - print("\nNCAS-IMAGE file detected, finding correct spec files") - print("Finding correct IMAGE version...") - version_number = conventions.replace("\n"," ").split("Metadata Standard ")[1].split(":")[0] - spec_folder = f"ncas-image-{version_number}" - if verbose: print(f" {version_number}") - specs_dir = os.path.join(conf["settings"].get("specs_dir", "./specs"), f"groups/{spec_folder}") - if not os.path.exists(specs_dir): - print(f"[ERROR] specs for NCAS-IMAGE {version_number} can not be found.") - print("Aborting...") - sys.exit() - product = file_path.split('/')[-1].split('_')[3] - product_spec = f"{spec_folder}/amof-{product}" - specs = [product_spec, f"{spec_folder}/amof-image-global-attrs"] - template = "off" - - + if ( + template == "auto" + and specs == None + and file_path.split("/")[-1].startswith("ncas-") + ): + template, specs = self._get_ncas_specs( + file_path, file_content, log_mode=log_mode, verbose=verbose + ) if template == "off": tmpl = template tmpl_input = "OFF" else: - tm = TemplateManager(auto_cache=auto_cache, verbose=verbose, log_mode=log_mode) + tm = TemplateManager( + auto_cache=auto_cache, verbose=verbose, log_mode=log_mode + ) tmpl = tm.get(file_path, template=template) tmpl_input = tmpl.inpt @@ -341,11 +477,21 @@ def check_file(self, file_path, template="auto", mappings=None, extra_rules=None if log_mode == "compact": print(f"{file_path} | {tmpl_input} | ", end="") else: - print(f"\nRunning with:\n\tTemplate: {tmpl_input}\n\tSpec Files: {specs}\n\tDatafile: {file_content.inpt}") - - self._check_file(file_content, template=tmpl, mappings=mappings, extra_rules=extra_rules, - specs=specs, ignore_attrs=ignore_attrs, log_mode=log_mode, - ignore_warnings=ignore_warnings, skip_spellcheck=skip_spellcheck) + print( + f"\nRunning with:\n\tTemplate: {tmpl_input}\n\tSpec Files: {specs}\n\tDatafile: {file_content.inpt}" + ) + + self._check_file( + file_content, + template=tmpl, + mappings=mappings, + extra_rules=extra_rules, + specs=specs, + ignore_attrs=ignore_attrs, + log_mode=log_mode, + ignore_warnings=ignore_warnings, + skip_spellcheck=skip_spellcheck, + ) class TemplateManager: @@ -360,13 +506,17 @@ def get(self, file_path, template="auto"): template = self._get_template_from_config(file_path) elif not os.path.isfile(template): if self.log_mode == "compact": - print(f"{file_path} | ABORTED | FATAL | Cannot find template file specified") + print( + f"{file_path} | ABORTED | FATAL | Cannot find template file specified" + ) sys.exit(1) else: raise Exception(f"Cannot find specified template file: {template}") fp = FileParser() - tmpl = fp.parse_file_header(template, auto_cache=self.auto_cache, verbose=self.verbose) + tmpl = fp.parse_file_header( + template, auto_cache=self.auto_cache, verbose=self.verbose + ) return tmpl def _get_template_from_config(self, file_path): @@ -377,17 +527,19 @@ def _get_template_from_config(self, file_path): config = conf[f"dataset:{dset}"] if "regex_path" in config and re.search(config["regex_path"], file_path): - return self._get_template_by_dataset(file_path, config) - elif "regex_file" in config and re.match(config["regex_file"], 
os.path.basename(file_path)): + return self._get_template_by_dataset(file_path, config) + elif "regex_file" in config and re.match( + config["regex_file"], os.path.basename(file_path) + ): return self._get_template_by_dataset(file_path, config) else: - return self._get_template_from_cache(file_path) + return self._get_template_from_cache(file_path) def _get_template_by_dataset(self, file_path, config): if "template" in config: return config["template"] elif "template_cache" in config: - return self._get_template_from_cache(file_path, config["template_cache"]) + return self._get_template_from_cache(file_path, config["template_cache"]) else: raise Exception("No rule for finding the template") @@ -397,20 +549,23 @@ def _get_template_from_cache(self, file_path, template_cache=None): tmpl_base = get_file_base(file_path) - if self.verbose: print(f"[INFO] Searching for exact match for: {tmpl_base}") + if self.verbose: + print(f"[INFO] Searching for exact match for: {tmpl_base}") matches = glob.glob(f"{template_cache}/{tmpl_base}_*.cdl") if matches: - match = matches[0] - if self.verbose: print(f"[INFO] Found exact match: {match}") + match = matches[0] + if self.verbose: + print(f"[INFO] Found exact match: {match}") else: - if self.verbose: print("[WARNING] Failed to find exact match, so trying nearest...") + if self.verbose: + print("[WARNING] Failed to find exact match, so trying nearest...") templates = os.listdir(template_cache) matches = difflib.get_close_matches(tmpl_base, templates) if matches: match = os.path.join(template_cache, matches[0]) - else: + else: match = conf["settings"].get("default_template") if not match: @@ -433,7 +588,7 @@ def parse_file_header(self, file_path, auto_cache=False, verbose=False): elif ext in ("yml"): reader = yml elif ext.lower() in IMAGE_EXTENSIONS: - reader = image + reader = image else: raise Exception(f"No known reader for file with extension: {ext}") @@ -441,7 +596,9 @@ def parse_file_header(self, file_path, auto_cache=False, verbose=False): if auto_cache: base = os.path.splitext(os.path.basename(file_path))[0] - output_path = os.path.join(conf["settings"]["default_template_cache_dir"], base) + output_path = os.path.join( + conf["settings"]["default_template_cache_dir"], base + ) if reader == cdl: # Special case for NetCDF files using CDL @@ -450,9 +607,14 @@ def parse_file_header(self, file_path, auto_cache=False, verbose=False): else: # All others use YAML with open(f"{output_path}.yml", "w") as writer: - yaml.dump(content.to_dict(), writer, Dumper=yaml.SafeDumper, - default_flow_style=False, sort_keys=False) - + yaml.dump( + content.to_dict(), + writer, + Dumper=yaml.SafeDumper, + default_flow_style=False, + sort_keys=False, + ) + return content diff --git a/checksit/cli.py b/checksit/cli.py index 6ed9b6bf..17d16d25 100644 --- a/checksit/cli.py +++ b/checksit/cli.py @@ -1,11 +1,12 @@ """Console script for checksit.""" __author__ = """Ag Stephens""" -__contact__ = 'ag.stephens@stfc.ac.uk' +__contact__ = "ag.stephens@stfc.ac.uk" __copyright__ = "Copyright 2022 United Kingdom Research and Innovation" __license__ = "BSD - see LICENSE file in top-level package directory" import click +import os from .utils import string_to_dict, string_to_list from .check import check_file @@ -13,6 +14,7 @@ from . import describer from . 
import specs + @click.group() def main(): """Console script for checker.""" @@ -35,12 +37,30 @@ def main(): @click.option("-t", "--template", default="auto") @click.option("-w", "--ignore-warnings", is_flag=True) @click.option("-p", "--skip-spellcheck", is_flag=True) -def check(file_path, mappings=None, rules=None, specs=None, ignore_attrs=None, ignore_all_globals=False, - ignore_all_dimensions=False, ignore_all_variables=False, ignore_all_variable_attrs=False, - auto_cache=False, log_mode="standard", verbose=False, template="auto", ignore_warnings=False, - skip_spellcheck=False): - - if ignore_all_globals or ignore_all_dimensions or ignore_all_variables or ignore_all_variable_attrs: +def check( + file_path, + mappings=None, + rules=None, + specs=None, + ignore_attrs=None, + ignore_all_globals=False, + ignore_all_dimensions=False, + ignore_all_variables=False, + ignore_all_variable_attrs=False, + auto_cache=False, + log_mode="standard", + verbose=False, + template="auto", + ignore_warnings=False, + skip_spellcheck=False, +): + + if ( + ignore_all_globals + or ignore_all_dimensions + or ignore_all_variables + or ignore_all_variable_attrs + ): raise Exception("Options not implemented yet!!!!!") if mappings: @@ -55,10 +75,19 @@ def check(file_path, mappings=None, rules=None, specs=None, ignore_attrs=None, i if ignore_attrs: ignore_attrs = string_to_list(ignore_attrs) - return check_file(file_path, template=template, mappings=mappings, extra_rules=rules, - specs=specs, ignore_attrs=ignore_attrs, - auto_cache=auto_cache, verbose=verbose, - log_mode=log_mode, ignore_warnings=ignore_warnings, skip_spellcheck=skip_spellcheck) + return check_file( + file_path, + template=template, + mappings=mappings, + extra_rules=rules, + specs=specs, + ignore_attrs=ignore_attrs, + auto_cache=auto_cache, + verbose=verbose, + log_mode=log_mode, + ignore_warnings=ignore_warnings, + skip_spellcheck=skip_spellcheck, + ) @main.command() @@ -68,28 +97,44 @@ def check(file_path, mappings=None, rules=None, specs=None, ignore_attrs=None, i @click.option("-x", "--exclude", default=None) @click.option("-e", "--exclude-file", default=None) @click.option("--verbose/--no-verbose", default=False) -def summary(log_files=None, log_directory=None, show_files=False, - exclude=None, exclude_file=None, - verbose=False): +def summary( + log_files=None, + log_directory=None, + show_files=False, + exclude=None, + exclude_file=None, + verbose=False, +): if exclude: - exclude = string_to_list(exclude) + exclude = string_to_list(exclude) else: exclude = [] - + if exclude_file: if not os.path.isfile(exclude_file): raise Exception(f"'--exclude-file' does not point to a valid file") with open(exclude_file) as exfile: - exclude.extend([exclude_pattern for exclude_pattern in exfile if exclude_pattern.strip()]) - - return summarise(log_files, log_directory=log_directory, show_files=show_files, - exclude=exclude, verbose=verbose) + exclude.extend( + [ + exclude_pattern + for exclude_pattern in exfile + if exclude_pattern.strip() + ] + ) + + return summarise( + log_files, + log_directory=log_directory, + show_files=show_files, + exclude=exclude, + verbose=verbose, + ) @main.command() -@click.argument("check_ids", nargs=-1, default=None) +@click.argument("check_ids", nargs=-1, default=None) @click.option("--verbose/--no-verbose", default=False) def describe(check_ids=None, verbose=False): return describer.describe(check_ids, verbose=verbose) @@ -104,4 +149,3 @@ def show_specs(spec_ids=None, verbose=False): if __name__ == "__main__": main() - diff 
--git a/checksit/cvs.py b/checksit/cvs.py index a8bea13c..23c9bac0 100644 --- a/checksit/cvs.py +++ b/checksit/cvs.py @@ -6,6 +6,7 @@ from .config import get_config + conf = get_config() vocabs_dir = conf["settings"]["vocabs_dir"] @@ -26,13 +27,20 @@ def _load(self, vocab_id): def _load_from_url(self, vocab_id): # Loads a specific vocabulary from a URL - vocab_id_url = vocab_id.replace("__URL__","https://") - if vocab_id_url.startswith("https://raw.githubusercontent.com") and "/__latest__/" in vocab_id_url: + vocab_id_url = vocab_id.replace("__URL__", "https://") + if ( + vocab_id_url.startswith("https://raw.githubusercontent.com") + and "/__latest__/" in vocab_id_url + ): vocab_id_url_base = vocab_id_url.split("/__latest__")[0] - vocab_id_url_base = vocab_id_url_base.replace("raw.githubusercontent.com","github.com") - latest_version = requests.get(f"{vocab_id_url_base}/releases/latest").url.split("/")[-1] + vocab_id_url_base = vocab_id_url_base.replace( + "raw.githubusercontent.com", "github.com" + ) + latest_version = requests.get( + f"{vocab_id_url_base}/releases/latest" + ).url.split("/")[-1] vocab_id_url = vocab_id_url.replace("__latest__", latest_version) - res = requests.get(vocab_id_url.replace("__URL__","https://")) + res = requests.get(vocab_id_url.replace("__URL__", "https://")) if res.status_code == 200: self._vocabs[vocab_id] = res.json() else: @@ -46,74 +54,66 @@ def __getitem__(self, vocab_id): else: self._load(vocab_id) - return self._vocabs[vocab_id] + return self._vocabs[vocab_id] def lookup(self, vocab_lookup): # A nested dictionary-style look-up using a string: vocab_lookup obj = self vocab_lookup = re.sub(f"^{vocabs_prefix}:", "", vocab_lookup) - for i,key in enumerate(vocab_lookup.split(":")): + for i, key in enumerate(vocab_lookup.split(":")): if isinstance(obj, dict) or i == 0: if key in WILDCARD: - if i+1 != len(vocab_lookup.split(":")): - obj = [ obj[key] for key in obj.keys() ] + if i + 1 != len(vocab_lookup.split(":")): + obj = [obj[key] for key in obj.keys()] else: # WILDCARD used as last option, just get keys obj = list(obj.keys()) else: obj = obj[key] else: - if not isinstance(obj,list): + if not isinstance(obj, list): # sanity check raise ValueError(f"Confused how we got here, obj = {obj}") elif key in WILDCARD: - raise ValueError(f"Second WILDCARD ({WILDCARD}) in query {vocab_lookup} not allowed") + raise ValueError( + f"Second WILDCARD ({WILDCARD}) in query {vocab_lookup} not allowed" + ) else: # obj should be list of dicts, creating list of values or dicts - obj = [ d[key] for d in obj ] + obj = [d[key] for d in obj] return obj -# def OLD_lookup(self, lookup): -# Used to have a special "__key__" lookup. not needed now. 
-# # Parses a lookup string (from a template) and then looks up the vocabulary -# # to return an item or a list of items -# lookup = re.sub("^__vocabs__:", "", lookup) -# self._load_vocab(lookup) -# comps = deque(lookup.split(":")) -# item = self.vocabs - -# while comps: -# comp = comps.popleft() -# if comp == "__key__": -# item = item.keys() -# elif isinstance(item, list): -# item = [i[comp] for i in item if i.get(comp)] -# else: -# item = item[comp] - -# return item - def check(self, vocab_lookup, value, label="", lookup=True): # Return a list of errors - empty list if no errors errors = [] - options = [ self.lookup(vocab_lookup) if lookup else vocab_lookup ][0] + options = [self.lookup(vocab_lookup) if lookup else vocab_lookup][0] if isinstance(options, list): if value not in options: - errors.append(f"{label} '{value}' not in vocab options: {options} (using: '{vocab_lookup}')") + errors.append( + f"{label} '{value}' not in vocab options: {options} (using: '{vocab_lookup}')" + ) elif isinstance(options, dict): for key in options.keys(): if key in value.keys(): - errors.extend(self.check(options[key], value[key], label = f"{label}:{key}", lookup=False)) + errors.extend( + self.check( + options[key], + value[key], + label=f"{label}:{key}", + lookup=False, + ) + ) else: errors.append(f"{label} does not have attribute '{key}'") elif value != options: - errors.append(f"{label} '{value}' does not equal required vocab value: '{options}' (using: '{vocab_lookup}')") + errors.append( + f"{label} '{value}' does not equal required vocab value: '{options}' (using: '{vocab_lookup}')" + ) return errors - vocabs = Vocabs() diff --git a/checksit/describer.py b/checksit/describer.py index c70d540d..32d0b481 100644 --- a/checksit/describer.py +++ b/checksit/describer.py @@ -17,7 +17,6 @@ def describe(check_ids=None, verbose=False): print("Functional check descriptions:") for check_id, check_func in check_funcs: - + print(f"\n{check_id}:\n\tFunction: {check_func.__name__}\n\tDescription:") print("\n\t".join([line for line in check_func.__doc__.split("\n")])) - diff --git a/checksit/generic.py b/checksit/generic.py index 54f2fda3..8679c57b 100644 --- a/checksit/generic.py +++ b/checksit/generic.py @@ -8,12 +8,22 @@ # date formate regex # could be yyyy, yyyymm, yyyymmdd, yyyymmdd-HH, yyyymmdd-HHMM, yyyymmdd-HHMMSS -date_regex = re.compile(r"^\d{4}$|^\d{6}$|^\d{8}$|^\d{8}-\d{2}$|^\d{8}-\d{4}$|^\d{8}-\d{6}$") +DATE_REGEX = re.compile( + r"^\d{4}$|^\d{6}$|^\d{8}$|^\d{8}-\d{2}$|^\d{8}-\d{4}$|^\d{8}-\d{6}$" +) + def _get_bounds_var_ids(dct): - return [var_id for var_id in dct["variables"] if ( - var_id.startswith("bounds_") or var_id.startswith("bnds_") or - var_id.endswith("_bounds") or var_id.endswith("_bnds"))] + return [ + var_id + for var_id in dct["variables"] + if ( + var_id.startswith("bounds_") + or var_id.startswith("bnds_") + or var_id.endswith("_bounds") + or var_id.endswith("_bnds") + ) + ] def one_spelling_mistake(word): @@ -21,20 +31,26 @@ def one_spelling_mistake(word): All edits that are one edit away from `word`. 
Adapted from https://norvig.com/spell-correct.html """ - letters = 'abcdefghijklmnopqrstuvwxyz0123456789._-' - splits = [(word[:i], word[i:]) for i in range(1,len(word) + 1)] # 1 in range requires first letter to be correct - deletes = [L + R[1:] for L, R in splits if R] - transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1] - replaces = [L + c + R[1:] for L, R in splits if R for c in letters] - inserts = [L + c + R for L, R in splits for c in letters] + letters = "abcdefghijklmnopqrstuvwxyz0123456789._-" + splits = [ + (word[:i], word[i:]) for i in range(1, len(word) + 1) + ] # 1 in range requires first letter to be correct + deletes = [L + R[1:] for L, R in splits if R] + transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] + replaces = [L + c + R[1:] for L, R in splits if R for c in letters] + inserts = [L + c + R for L, R in splits for c in letters] return set(deletes + transposes + replaces + inserts) + def two_spelling_mistakes(word): """ All edits that are two edits away from `word`. From https://norvig.com/spell-correct.html """ - return set([ e2 for e1 in one_spelling_mistake(word) for e2 in one_spelling_mistake(e1) ]) + return set( + [e2 for e1 in one_spelling_mistake(word) for e2 in one_spelling_mistake(e1)] + ) + def search_close_match(search_for, search_in): possible_close_edits = two_spelling_mistakes(search_for.lower()) @@ -56,16 +72,26 @@ def check_var_attrs(dct, defined_attrs, ignore_bounds=True, skip_spellcheck=Fals bounds_vars = _get_bounds_var_ids(dct) for var_id, var_dict in dct["variables"].items(): - if var_id in bounds_vars: continue + if var_id in bounds_vars: + continue for attr in defined_attrs: if is_undefined(var_dict.get(attr)): - errors.append(f"[variable**************:{var_id}]: Attribute '{attr}' must have a valid definition.") + errors.append( + f"[variable**************:{var_id}]: Attribute '{attr}' must have a valid definition." + ) return errors, warnings -def check_global_attrs(dct, defined_attrs=None, vocab_attrs=None, regex_attrs=None, rules_attrs=None, skip_spellcheck=False): +def check_global_attrs( + dct, + defined_attrs=None, + vocab_attrs=None, + regex_attrs=None, + rules_attrs=None, + skip_spellcheck=False, +): """ Check that required global attributes are correct. @@ -81,53 +107,71 @@ def check_global_attrs(dct, defined_attrs=None, vocab_attrs=None, regex_attrs=No warnings = [] for attr in defined_attrs: - if attr not in dct['global_attributes']: + if attr not in dct["global_attributes"]: errors.append( f"[global-attributes:**************:{attr}]: Attribute '{attr}' does not exist. " f"{search_close_match(attr, dct['global_attributes'].keys()) if not skip_spellcheck else ''}" ) - elif is_undefined(dct['global_attributes'].get(attr)): - errors.append(f"[global-attributes:**************:{attr}]: No value defined for attribute '{attr}'.") + elif is_undefined(dct["global_attributes"].get(attr)): + errors.append( + f"[global-attributes:**************:{attr}]: No value defined for attribute '{attr}'." + ) for attr in vocab_attrs: - if attr not in dct['global_attributes']: + if attr not in dct["global_attributes"]: errors.append( f"[global-attributes:**************:{attr}]: Attribute '{attr}' does not exist. 
" f"{search_close_match(attr, dct['global_attributes'].keys()) if not skip_spellcheck else ''}" ) - elif is_undefined(dct['global_attributes'].get(attr)): - errors.append(f"[global-attributes:**************:{attr}]: No value defined for attribute '{attr}'.") + elif is_undefined(dct["global_attributes"].get(attr)): + errors.append( + f"[global-attributes:**************:{attr}]: No value defined for attribute '{attr}'." + ) else: - errors.extend(vocabs.check(vocab_attrs[attr], dct["global_attributes"].get(attr), label=f"[global-attributes:******:{attr}]***")) + errors.extend( + vocabs.check( + vocab_attrs[attr], + dct["global_attributes"].get(attr), + label=f"[global-attributes:******:{attr}]***", + ) + ) for attr in regex_attrs: - if attr not in dct['global_attributes']: + if attr not in dct["global_attributes"]: errors.append( f"[global-attributes:**************:{attr}]: Attribute '{attr}' does not exist. " f"{search_close_match(attr, dct['global_attributes'].keys()) if not skip_spellcheck else ''}" ) - elif is_undefined(dct['global_attributes'].get(attr)): - errors.append(f"[global-attributes:**************:{attr}]: No value defined for attribute '{attr}'.") - elif not re.match(regex_attrs[attr], dct['global_attributes'].get(attr)): + elif is_undefined(dct["global_attributes"].get(attr)): + errors.append( + f"[global-attributes:**************:{attr}]: No value defined for attribute '{attr}'." + ) + elif not re.match(regex_attrs[attr], dct["global_attributes"].get(attr)): errors.append( f"[global-attributes:******:{attr}]: '{dct['global_attributes'].get(attr, UNDEFINED)}' " f"does not match regex pattern '{regex_attrs[attr]}'." ) for attr in rules_attrs: - if attr not in dct['global_attributes']: + if attr not in dct["global_attributes"]: errors.append( f"[global-attributes:**************:{attr}]: Attribute '{attr}' does not exist. " f"{search_close_match(attr, dct['global_attributes'].keys()) if not skip_spellcheck else ''}" ) - elif is_undefined(dct['global_attributes'].get(attr)): - errors.append(f"[global-attributes:**************:{attr}]: No value defined for attribute '{attr}'.") + elif is_undefined(dct["global_attributes"].get(attr)): + errors.append( + f"[global-attributes:**************:{attr}]: No value defined for attribute '{attr}'." + ) else: - rules_check_output = rules.check(rules_attrs[attr], dct['global_attributes'].get(attr), context=dct['inpt'], label=f"[global-attributes:******:{attr}]***") + rules_check_output = rules.check( + rules_attrs[attr], + dct["global_attributes"].get(attr), + context=dct["inpt"], + label=f"[global-attributes:******:{attr}]***", + ) warnings.extend(rules_check_output[1]) errors.extend(rules_check_output[0]) - return errors, warnings @@ -141,8 +185,8 @@ def check_var_exists(dct, variables, skip_spellcheck=False): warnings = [] for var in variables: - if ':__OPTIONAL__' in var: - var = var.split(':')[0] + if ":__OPTIONAL__" in var: + var = var.split(":")[0] if var not in dct["variables"].keys(): warnings.append( f"[variable**************:{var}]: Optional variable does not exist in file. " @@ -168,8 +212,8 @@ def check_dim_exists(dct, dimensions, skip_spellcheck=False): warnings = [] for dim in dimensions: - if ':__OPTIONAL__' in dim: - dim = dim.split(':')[0] + if ":__OPTIONAL__" in dim: + dim = dim.split(":")[0] if dim not in dct["dimensions"].keys(): warnings.append( f"[dimension**************:{dim}]: Optional dimension does not exist in file. 
" @@ -225,8 +269,8 @@ def check_var(dct, variable, defined_attrs, rules_attrs=None, skip_spellcheck=Fa if isinstance(variable, list): variable = variable[0] - if ':__OPTIONAL__' in variable: - variable = variable.split(':')[0] + if ":__OPTIONAL__" in variable: + variable = variable.split(":")[0] if variable not in dct["variables"].keys(): warnings.append( f"[variable**************:{variable}]: Optional variable does not exist in file. " @@ -235,30 +279,35 @@ def check_var(dct, variable, defined_attrs, rules_attrs=None, skip_spellcheck=Fa else: for attr in defined_attrs: if isinstance(attr, dict) and len(attr.keys()) == 1: - for key,value in attr.items(): - attr = f'{key}: {value}' - attr_key = attr.split(':')[0] - attr_value = ':'.join(attr.split(':')[1:]) + for key, value in attr.items(): + attr = f"{key}: {value}" + attr_key = attr.split(":")[0] + attr_value = ":".join(attr.split(":")[1:]) if attr_key not in dct["variables"][variable]: errors.append( f"[variable**************:{variable}]: Attribute '{attr_key}' does not exist. " f"{search_close_match(attr_key, dct['variables'][variable]) if not skip_spellcheck else ''}" ) - elif '' in attr_value: + elif "" in attr_value: # work this out pass - elif attr_key == 'flag_values': - attr_value = attr_value.strip(',') - attr_value = [ int(i.strip('b')) for i in attr_value.split(',') ] + elif attr_key == "flag_values": + attr_value = attr_value.strip(",") + attr_value = [int(i.strip("b")) for i in attr_value.split(",")] attr_value = np.array(attr_value, dtype=np.int8) - if not ((len(dct["variables"][variable].get(attr_key)) == len(attr_value)) and np.all(dct["variables"][variable].get(attr_key) == attr_value)): + if not ( + ( + len(dct["variables"][variable].get(attr_key)) + == len(attr_value) + ) + and np.all( + dct["variables"][variable].get(attr_key) == attr_value + ) + ): errors.append( f"[variable**************:{variable}]: Attribute '{attr_key}' must have definition '{attr_value}', " f"not '{dct['variables'][variable].get(attr_key)}'." ) - #elif attr_key == 'flag_meanings': - # print(attr_value) - # print(dct["variables"][variable].get(attr_key)) elif not str(dct["variables"][variable].get(attr_key)) == attr_value: errors.append( f"[variable**************:{variable}]: Attribute '{attr_key}' must have definition '{attr_value}', " @@ -307,7 +356,6 @@ def check_var(dct, variable, defined_attrs, rules_attrs=None, skip_spellcheck=Fa warnings.extend(rule_warnings) - else: if variable not in dct["variables"].keys(): errors.append( @@ -316,14 +364,14 @@ def check_var(dct, variable, defined_attrs, rules_attrs=None, skip_spellcheck=Fa ) else: for attr in defined_attrs: - attr_key = attr.split(':')[0] - attr_value = ':'.join(attr.split(':')[1:]) + attr_key = attr.split(":")[0] + attr_value = ":".join(attr.split(":")[1:]) if attr_key not in dct["variables"][variable]: errors.append( f"[variable**************:{variable}]: Attribute '{attr_key}' does not exist. 
" f"{search_close_match(attr_key, dct['variables'][variable]) if not skip_spellcheck else ''}" ) - elif '<' in attr_value: + elif "<" in attr_value: # work this out pass elif not dct["variables"][variable].get(attr_key) == attr_value: @@ -390,16 +438,29 @@ def check_file_name(file_name, vocab_checks=None, rule_checks=None, **kwargs): # check instrument name if "instrument" in vocab_checks.keys(): - if vocabs.check(vocab_checks["instrument"], file_name_parts[0], label="_") != []: - errors.append(f"[file name]: Invalid file name format - unknown instrument '{file_name_parts[0]}'") + if ( + vocabs.check(vocab_checks["instrument"], file_name_parts[0], label="_") + != [] + ): + errors.append( + f"[file name]: Invalid file name format - unknown instrument '{file_name_parts[0]}'" + ) else: msg = "No instrument vocab defined in specs" raise KeyError(msg) # check platform if "platform" in rule_checks.keys(): - if rules.check(rule_checks["platform"], file_name_parts[1], label="[file name]: Invalid file name format -") != ([], []): - rule_errors, rule_warnings = rules.check(rule_checks["platform"], file_name_parts[1], label="[file name]: Invalid file name format -") + if rules.check( + rule_checks["platform"], + file_name_parts[1], + label="[file name]: Invalid file name format -", + ) != ([], []): + rule_errors, rule_warnings = rules.check( + rule_checks["platform"], + file_name_parts[1], + label="[file name]: Invalid file name format -", + ) if rule_errors != []: errors.extend(rule_errors) if rule_warnings != []: @@ -411,8 +472,10 @@ def check_file_name(file_name, vocab_checks=None, rule_checks=None, **kwargs): # check date format # could be yyyy, yyyymm, yyyymmdd, yyyymmdd-HH, yyyymmdd-HHMM, yyyymmdd-HHMMSS # first checks format, then date validity - if not date_regex.match(file_name_parts[2]): - errors.append(f"[file name]: Invalid file name format - bad date format '{file_name_parts[2]}'") + if not DATE_REGEX.match(file_name_parts[2]): + errors.append( + f"[file name]: Invalid file name format - bad date format '{file_name_parts[2]}'" + ) else: fmts = ("%Y", "%Y%m", "%Y%m%d", "%Y%m%d-%H", "%Y%m%d-%H%M", "%Y%m%d-%H%M%S") valid_date_found = False @@ -424,12 +487,19 @@ def check_file_name(file_name, vocab_checks=None, rule_checks=None, **kwargs): except ValueError: pass if not valid_date_found: - errors.append(f"[file name]: Invalid file name format - invalid date in file name '{file_name_parts[2]}'") + errors.append( + f"[file name]: Invalid file name format - invalid date in file name '{file_name_parts[2]}'" + ) # check data product if "data_product" in vocab_checks.keys(): - if vocabs.check(vocab_checks["data_product"], file_name_parts[3], label="_") != []: - errors.append(f"[file name]: Invalid file name format - unknown data product '{file_name_parts[3]}'") + if ( + vocabs.check(vocab_checks["data_product"], file_name_parts[3], label="_") + != [] + ): + errors.append( + f"[file name]: Invalid file name format - unknown data product '{file_name_parts[3]}'" + ) else: msg = "No data product vocab defined in specs" raise KeyError(msg) @@ -437,11 +507,15 @@ def check_file_name(file_name, vocab_checks=None, rule_checks=None, **kwargs): # check version number format version_component = file_name_parts[-1].split(".nc")[0] if not re.match(r"^v\d.\d$", version_component): - errors.append(f"[file name]: Invalid file name format - incorrect file version number '{version_component}'") + errors.append( + f"[file name]: Invalid file name format - incorrect file version number '{version_component}'" + ) # 
check number of options - max length of splitted file name if len(file_name_parts) > 8: - errors.append(f"[file name]: Invalid file name format - too many options in file name") + errors.append( + f"[file name]: Invalid file name format - too many options in file name" + ) return errors, warnings diff --git a/checksit/make_specs.py b/checksit/make_specs.py index 3738af9c..30d2aeff 100644 --- a/checksit/make_specs.py +++ b/checksit/make_specs.py @@ -6,19 +6,20 @@ # USEFUL FUNCTIONS # #################### + def map_data_type(dtype): data_map = { - 'float64':'double', - 'float32':'float', - 'int32':'int', - 'byte':'byte', + "float64": "double", + "float32": "float", + "int32": "int", + "byte": "byte", } return data_map[dtype] - # main function + def make_amof_specs(version_number): ############### # DIRECTORIES # @@ -27,22 +28,21 @@ def make_amof_specs(version_number): cvs_dir = f"vocabs/AMF_CVs/{version_number}" out_dir = f"../specs/groups/ncas-amof-{version_number}" - ################ # GLOBAL ATTRS # ################ - with open(f'{cvs_dir}/AMF_product_common_global-attributes_land.json') as f: - data = json.load(f)['product_common_global-attributes_land'] + with open(f"{cvs_dir}/AMF_product_common_global-attributes_land.json") as f: + data = json.load(f)["product_common_global-attributes_land"] attr_rules = {} for attr in data.keys(): - compliance = data[attr]['compliance_checking_rules'] + compliance = data[attr]["compliance_checking_rules"] if compliance.lower() in ["exact match", "exact match of text to the left"]: rule = f"regex:{data[attr]['fixed_value']}" elif "String: min" in compliance: - number = compliance.split(' ')[2] + number = compliance.split(" ")[2] rule = f"rule-func:string-of-length:{number}+" elif compliance.lower() == "valid email": rule = "regex-rule:valid-email" @@ -51,25 +51,36 @@ def make_amof_specs(version_number): elif compliance.lower() == "valid url _or_ n/a": rule = "regex-rule:valid-url-or-na" elif "match: " in compliance.lower(): - if r'YYYY-MM-DDThh:mm:ss\.\d+ _or_ N/A' in compliance: + if r"YYYY-MM-DDThh:mm:ss\.\d+ _or_ N/A" in compliance: rule = "regex-rule:datetime-or-na" - elif 'vN.M' in compliance: + elif "vN.M" in compliance: rule = "regex-rule:match:vN.M" - elif r'YYYY-MM-DDThh:mm:ss\.\d+' in compliance: + elif r"YYYY-MM-DDThh:mm:ss\.\d+" in compliance: rule = "regex-rule:datetime" - elif ' m' in compliance: + elif " m" in compliance: rule = r"regex:^-?\d+\.?\d* m$" else: rule = f"regex-rule:EDIT:{compliance}" - elif compliance.lower() in ["number","integer","int","float","string","str"]: + elif compliance.lower() in [ + "number", + "integer", + "int", + "float", + "string", + "str", + ]: rule = f"type-rule:{compliance.lower()}" elif compliance.lower() == "exact match in vocabulary": # known vocab matches - if attr == 'source': - rule = (f"__vocabs__:AMF_CVs/{version_number}/AMF_ncas_instrument:" - "ncas_instrument:__all__:description") - elif attr == 'platform': - rule = f"__vocabs__:AMF_CVs/{version_number}/AMF_platform:platform:__all__" + if attr == "source": + rule = ( + f"__vocabs__:AMF_CVs/{version_number}/AMF_ncas_instrument:" + "ncas_instrument:__all__:description" + ) + elif attr == "platform": + rule = ( + f"__vocabs__:AMF_CVs/{version_number}/AMF_platform:platform:__all__" + ) else: # a few extra catches if attr == "institution": @@ -77,126 +88,137 @@ def make_amof_specs(version_number): elif attr == "platform_type": rule = "rule-func:match-one-of:stationary_platform|moving_platform" elif attr == "featureType": - rule = 
"rule-func:match-one-of:timeSeries|timeSeriesProfile|trajectory" + rule = ( + "rule-func:match-one-of:timeSeries|timeSeriesProfile|trajectory" + ) else: rule = f"__vocabs__:EDIT:{compliance}" elif "one of: " in compliance.lower(): - options = compliance.split(': ')[1] - options = options.replace(',','|') - while ' ' in options: - options = options.replace(' ','') + options = compliance.split(": ")[1] + options = options.replace(",", "|") + while " " in options: + options = options.replace(" ", "") rule = f"rule-func:match-one-of:{options}" else: rule = f"UNKNOWN compliance: {compliance}" - rule = rule.replace('(',r'\(') - rule = rule.replace(')',r'\)') - rule = [ rule.replace(' ',r'\s') if "regex:" in rule else rule ][0] + rule = rule.replace("(", r"\(") + rule = rule.replace(")", r"\)") + rule = [rule.replace(" ", r"\s") if "regex:" in rule else rule][0] attr_rules[attr] = rule - - with open(f'{out_dir}/amof-global-attrs.yml', 'w') as f: - f.write(('required-global-attrs:\n func: checksit.generic.check_global_attrs\n' - ' params:\n vocab_attrs:\n')) + with open(f"{out_dir}/amof-global-attrs.yml", "w") as f: + f.write( + ( + "required-global-attrs:\n func: checksit.generic.check_global_attrs\n" + " params:\n vocab_attrs:\n" + ) + ) for attr, rule in attr_rules.items(): if "__vocabs__" in rule: - f.write(f' {attr}: {rule}\n') - f.write(' rules_attrs:\n') + f.write(f" {attr}: {rule}\n") + f.write(" rules_attrs:\n") for attr, rule in attr_rules.items(): - if rule.split(':')[0] in ['regex','regex-rule','type-rule','rule-func']: - f.write(f' {attr}: {rule}\n') + if rule.split(":")[0] in ["regex", "regex-rule", "type-rule", "rule-func"]: + f.write(f" {attr}: {rule}\n") #################### # DEPLOYMENT MODES # #################### - deployment_modes = ['land','sea','air','trajectory'] + deployment_modes = ["land", "sea", "air", "trajectory"] for mode in deployment_modes: - with open(f'{cvs_dir}/AMF_product_common_dimension_{mode}.json') as f: - deploy_dims = json.load(f)[f'product_common_dimension_{mode}'].keys() - with open(f'{cvs_dir}/AMF_product_common_variable_{mode}.json') as f: - data = json.load(f)[f'product_common_variable_{mode}'] - #deploy_vars_attrs = {} + with open(f"{cvs_dir}/AMF_product_common_dimension_{mode}.json") as f: + deploy_dims = json.load(f)[f"product_common_dimension_{mode}"].keys() + with open(f"{cvs_dir}/AMF_product_common_variable_{mode}.json") as f: + data = json.load(f)[f"product_common_variable_{mode}"] + # deploy_vars_attrs = {} deploy_vars = {} for variable in data.keys(): deploy_vars[variable] = [] for attr in data[variable].keys(): attr_value = data[variable][attr] - if attr == 'type': + if attr == "type": attr_value = map_data_type(attr_value) - deploy_vars[variable].append(f'{attr}:{attr_value}') - + deploy_vars[variable].append(f"{attr}:{attr_value}") - - spec_file_name = f'{out_dir}/amof-common-{mode}.yml' - with open(spec_file_name, 'w') as f: + spec_file_name = f"{out_dir}/amof-common-{mode}.yml" + with open(spec_file_name, "w") as f: # variables for i, var in enumerate(deploy_vars.items()): - f.write(f'var-requires{i}:\n') - f.write((' func: checksit.generic.check_var\n params:\n variable:\n' - f' - {var[0]}\n defined_attrs:\n')) + f.write(f"var-requires{i}:\n") + f.write( + ( + " func: checksit.generic.check_var\n params:\n variable:\n" + f" - {var[0]}\n defined_attrs:\n" + ) + ) for attr in var[1]: - attr_key = attr.split(':')[0] - attr_value = ':'.join(attr.split(':')[1:]) - f.write(f' - {attr_key}:{attr_value}\n') - f.write('\n') + attr_key = 
attr.split(":")[0] + attr_value = ":".join(attr.split(":")[1:]) + f.write(f" - {attr_key}:{attr_value}\n") + f.write("\n") # dimensions - f.write(('dims-requires:\n func: checksit.generic.check_dim_exists\n' - ' params:\n dimensions:\n')) + f.write( + ( + "dims-requires:\n func: checksit.generic.check_dim_exists\n" + " params:\n dimensions:\n" + ) + ) for dim in deploy_dims: - f.write(f' - {dim}\n') + f.write(f" - {dim}\n") ############## ## PRODUCTS ## ############## # load all products - with open(f'{cvs_dir}/AMF_product.json') as f: - products = json.load(f)['product'] - + with open(f"{cvs_dir}/AMF_product.json") as f: + products = json.load(f)["product"] # go through each product, create spec file for product in products: - product = product.replace(' ','') + product = product.replace(" ", "") - if exists(f'{cvs_dir}/AMF_product_{product}_variable.json'): - with open(f'{cvs_dir}/AMF_product_{product}_variable.json') as f: - data = json.load(f)[f'product_{product}_variable'] + if exists(f"{cvs_dir}/AMF_product_{product}_variable.json"): + with open(f"{cvs_dir}/AMF_product_{product}_variable.json") as f: + data = json.load(f)[f"product_{product}_variable"] product_info = {} for variable in data.keys(): product_info[variable] = [] for attr in data[variable].keys(): attr_value = data[variable][attr] - if attr == 'flag_meanings': - attr_value = attr_value.replace('|',' ').replace(' ',' ') - elif attr == 'type': + if attr == "flag_meanings": + attr_value = attr_value.replace("|", " ").replace(" ", " ") + elif attr == "type": attr_value = map_data_type(attr_value) - product_info[variable].append(f'{attr}:{attr_value}') + product_info[variable].append(f"{attr}:{attr_value}") prod_vars_exist = True else: prod_vars_exist = False - if exists(f'{cvs_dir}/AMF_product_{product}_dimension.json'): - with open(f'{cvs_dir}/AMF_product_{product}_dimension.json') as f: - product_dims = json.load(f)[f'product_{product}_dimension'].keys() + if exists(f"{cvs_dir}/AMF_product_{product}_dimension.json"): + with open(f"{cvs_dir}/AMF_product_{product}_dimension.json") as f: + product_dims = json.load(f)[f"product_{product}_dimension"].keys() prod_dims_exist = True else: prod_dims_exist = False - - if exists(f'{cvs_dir}/AMF_product_{product}_global-attributes.json'): - with open(f'{cvs_dir}/AMF_product_{product}_global-attributes.json') as f: - data = json.load(f)[f'product_{product}_global-attributes'] + if exists(f"{cvs_dir}/AMF_product_{product}_global-attributes.json"): + with open(f"{cvs_dir}/AMF_product_{product}_global-attributes.json") as f: + data = json.load(f)[f"product_{product}_global-attributes"] attr_rules = {} for attr in data.keys(): - compliance = data[attr]['compliance_checking_rules'] - if compliance.lower() in ["exact match", - "exact match of text to the left"]: + compliance = data[attr]["compliance_checking_rules"] + if compliance.lower() in [ + "exact match", + "exact match of text to the left", + ]: rule = f"regex:{data[attr]['fixed_value']}" elif "String: min" in compliance: - number = compliance.split(' ')[2] + number = compliance.split(" ")[2] rule = f"rule-func:string-of-length:{number}+" elif compliance.lower() == "valid email": rule = "regex-rule:valid-email" @@ -205,81 +227,105 @@ def make_amof_specs(version_number): elif compliance.lower() == "valid url _or_ n/a": rule = "regex-rule:valid-url-or-na" elif "match: " in compliance.lower(): - if r'YYYY-MM-DDThh:mm:ss\.\d+ _or_ N/A' in compliance: + if r"YYYY-MM-DDThh:mm:ss\.\d+ _or_ N/A" in compliance: rule = 
"regex-rule:datetime-or-na" - elif 'vN.M' in compliance: + elif "vN.M" in compliance: rule = "regex-rule:match:vN.M" - elif r'YYYY-MM-DDThh:mm:ss\.\d+' in compliance: + elif r"YYYY-MM-DDThh:mm:ss\.\d+" in compliance: rule = "regex-rule:datetime" else: rule = f"regex-rule:EDIT:{compliance}" - elif compliance.lower() in ["number","integer", - "int","float","string","str"]: + elif compliance.lower() in [ + "number", + "integer", + "int", + "float", + "string", + "str", + ]: rule = f"type-rule:{compliance.lower()}" elif compliance.lower() == "exact match in vocabulary": - if attr == 'source': - rule = ("__vocabs__:AMF_CVs/AMF_ncas_instrument:" - "ncas_instrument:__all__:description") - elif attr == 'platform': + if attr == "source": + rule = ( + "__vocabs__:AMF_CVs/AMF_ncas_instrument:" + "ncas_instrument:__all__:description" + ) + elif attr == "platform": rule = "__vocabs__:AMF_CVs/AMF_platform:platform:__all__" else: rule = f"__vocabs__:EDIT:{compliance}" elif "one of: " in compliance.lower(): - options = compliance.split(': ')[1] - options = options.replace(',','|') - while ' ' in options: - options = options.replace(' ','') + options = compliance.split(": ")[1] + options = options.replace(",", "|") + while " " in options: + options = options.replace(" ", "") rule = f"rule-func:match-one-of:{options}" else: rule = f"UNKNOWN compliance: {compliance}" - rule = rule.replace('(',r'\(') - rule = rule.replace(')',r'\)') - rule = [ rule.replace(' ',r'\s') if "regex:" in rule else rule ][0] + rule = rule.replace("(", r"\(") + rule = rule.replace(")", r"\)") + rule = [rule.replace(" ", r"\s") if "regex:" in rule else rule][0] attr_rules[attr] = rule prod_attrs_exist = True else: prod_attrs_exist = False - - - spec_file_name = f'{out_dir}/amof-{product}.yml' - with open(spec_file_name, 'w') as f: + spec_file_name = f"{out_dir}/amof-{product}.yml" + with open(spec_file_name, "w") as f: if prod_vars_exist: for i, var in enumerate(product_info.items()): qc_flags = False - f.write(f'var-requires{i}:\n') - f.write((' func: checksit.generic.check_var\n' - ' params:\n variable:\n' - f' - {var[0]}:__OPTIONAL__\n defined_attrs:\n')) + f.write(f"var-requires{i}:\n") + f.write( + ( + " func: checksit.generic.check_var\n" + " params:\n variable:\n" + f" - {var[0]}:__OPTIONAL__\n defined_attrs:\n" + ) + ) for attr in var[1]: - attr_key = attr.split(':')[0] - attr_value = ':'.join(attr.split(':')[1:]) + attr_key = attr.split(":")[0] + attr_value = ":".join(attr.split(":")[1:]) if attr_key not in ["flag_values", "flag_meanings"]: - f.write(f' - {attr_key}:{attr_value}\n') + f.write(f" - {attr_key}:{attr_value}\n") else: qc_flags = True if qc_flags: - f.write(f' attr_rules:\n - rule-func:check-qc-flags\n') + f.write(f" attr_rules:\n - rule-func:check-qc-flags\n") if prod_dims_exist: - f.write(('dims-requires:\n func: checksit.generic.check_dim_exists\n' - ' params:\n dimensions:\n')) + f.write( + ( + "dims-requires:\n func: checksit.generic.check_dim_exists\n" + " params:\n dimensions:\n" + ) + ) for dim in product_dims: - f.write(f' - {dim}:__OPTIONAL__\n') + f.write(f" - {dim}:__OPTIONAL__\n") if prod_attrs_exist: - f.write(('\nrequired-global-attrs:\n func:' - ' checksit.generic.check_global_attrs\n' - ' params:\n vocab_attrs:\n')) + f.write( + ( + "\nrequired-global-attrs:\n func:" + " checksit.generic.check_global_attrs\n" + " params:\n vocab_attrs:\n" + ) + ) for attr, rule in attr_rules.items(): if "__vocabs__" in rule: - f.write(f' {attr}: {rule}\n') - f.write(' rules_attrs:\n') + f.write(f" {attr}: 
{rule}\n") + f.write(" rules_attrs:\n") for attr, rule in attr_rules.items(): - if rule.split(':')[0] in ['regex','regex-rule','type-rule','rule-func']: - f.write(f' {attr}: {rule}\n') + if rule.split(":")[0] in [ + "regex", + "regex-rule", + "type-rule", + "rule-func", + ]: + f.write(f" {attr}: {rule}\n") if __name__ == "__main__": import sys + version_number = sys.argv[1] make_amof_specs(version_number) diff --git a/checksit/readers/cdl.py b/checksit/readers/cdl.py index 6f5541a0..53c0ac45 100644 --- a/checksit/readers/cdl.py +++ b/checksit/readers/cdl.py @@ -6,9 +6,10 @@ from ..cvs import vocabs, vocabs_prefix + def get_output(cmd): subp = sp.Popen(cmd, shell=True, stdout=sp.PIPE) - return subp.stdout.read().decode('utf-8') + return subp.stdout.read().decode("utf-8") class CDLParser: @@ -22,7 +23,8 @@ def __init__(self, inpt, verbose=False): self._check_format() def _parse(self, inpt): - if self.verbose: print(f"[INFO] Parsing input: {inpt[:100]}...") + if self.verbose: + print(f"[INFO] Parsing input: {inpt[:100]}...") if inpt.endswith(".nc"): self.cdl = get_output(f"ncdump -h {inpt}") elif inpt.endswith(".cdl"): @@ -41,10 +43,14 @@ def _parse(self, inpt): for s in self.CDL_SPLITTERS: if s not in cdl_lines: - print(f"Please check your command - invalid file or CDL contents provided: '{inpt[:100]}...'") + print( + f"Please check your command - invalid file or CDL contents provided: '{inpt[:100]}...'" + ) sys.exit(1) - sections = self._get_sections(cdl_lines, split_patterns=self.CDL_SPLITTERS, start_at=1) + sections = self._get_sections( + cdl_lines, split_patterns=self.CDL_SPLITTERS, start_at=1 + ) # Re-split section 1 to separate variables from global attrs self.dimensions = self._ordered_dict(sections[0]) @@ -57,7 +63,9 @@ def _check_format(self): min_chars = 10 if len(source) < min_chars: - self.fmt_errors.append(f"[FORMAT:global_attributes:source] Must be at least {min_chars} characters, not {source}") + self.fmt_errors.append( + f"[FORMAT:global_attributes:source] Must be at least {min_chars} characters, not {source}" + ) def _get_sections(self, lines, split_patterns, start_at): split_patterns = deque(split_patterns) @@ -67,7 +75,8 @@ def _get_sections(self, lines, split_patterns, start_at): current = [] for i, line in enumerate(lines): - if i < start_at or not line.strip(): continue + if i < start_at or not line.strip(): + continue if line.startswith(splitter): if current: @@ -78,7 +87,9 @@ def _get_sections(self, lines, split_patterns, start_at): if split_patterns: splitter = split_patterns.popleft() else: - line_no_comments = re.split(r";\s+//.*$", line)[0].strip().rstrip(";").strip() + line_no_comments = ( + re.split(r";\s+//.*$", line)[0].strip().rstrip(";").strip() + ) if not line_no_comments.startswith("//"): current.append(line_no_comments) @@ -87,14 +98,16 @@ def _get_sections(self, lines, split_patterns, start_at): def _split_vars_globals(self, content): variables = [] for i, line in enumerate(content): - if line.startswith(":"): break + if line.startswith(":"): + break variables.append(line) global_attrs = content[i:] return self._construct_variables(variables), self._ordered_dict(global_attrs) def _parse_var_dtype_dims(self, line): - if self.verbose: print(f"PARSING LINE: {line}") + if self.verbose: + print(f"PARSING LINE: {line}") dtype, var_info = line.strip().split(" ", 1) var_id = var_info.split("(")[0] dim_info = line.replace(f"{dtype} {var_id}", "").strip() @@ -109,7 +122,11 @@ def _safe_parse_value(self, value): return eval(value) except: # Remove datatype 
suffixes and parse as list if commas are in value - return eval(", ".join([part.strip().rstrip("bBcCfFiIlLsS") for part in value.split(",")])) + return eval( + ", ".join( + [part.strip().rstrip("bBcCfFiIlLsS") for part in value.split(",")] + ) + ) def _construct_variables(self, content): variables = {} @@ -125,7 +142,11 @@ def _construct_variables(self, content): vocab_var_id = line.split(":")[3] vocab_lookup = line.split(":", 1)[-1] variables[vocab_var_id] = vocabs.lookup(vocab_lookup) - elif not var_id or not line.startswith(f"{var_id}:") and last_line.strip()[-1] != ",": + elif ( + not var_id + or not line.startswith(f"{var_id}:") + and last_line.strip()[-1] != "," + ): # Add current collected variable to list if it exists if current: variables[var_id] = current.copy() @@ -137,9 +158,11 @@ def _construct_variables(self, content): dimensions = ", ".join(dimensions) current = {"type": dtype, "dimension": dimensions} else: -# key, value = [x.strip() for x in line.split(":", 1)[1].split("=", 1)] + # key, value = [x.strip() for x in line.split(":", 1)[1].split("=", 1)] # Send last key and last value (from last iteration of loop) and line to get new value - key, value = self._parse_key_value_multiline_safe(line, key, value, variable_attr=True) + key, value = self._parse_key_value_multiline_safe( + line, key, value, variable_attr=True + ) current[key] = self._safe_parse_value(value) last_line = line @@ -148,13 +171,15 @@ def _construct_variables(self, content): return variables - def _parse_key_value_multiline_safe(self, line, last_key, last_value, variable_attr=False): + def _parse_key_value_multiline_safe( + self, line, last_key, last_value, variable_attr=False + ): # Caters for continuation lines for arrays of strings, etc if "=" in line: # A new (key, value) pair is found - if variable_attr: # var attr + if variable_attr: # var attr key, value = [x.strip() for x in line.split(":", 1)[1].split("=", 1)] - else: # global attr + else: # global attr key, value = [x.strip() for x in line.lstrip(":").split("=", 1)] else: # Assume a continuation of th last value, so set key to None @@ -162,22 +187,22 @@ def _parse_key_value_multiline_safe(self, line, last_key, last_value, variable_a return key, value - def _ordered_dict(self, content): resp = {} key = None value = None for line in content: - if self.verbose: print(f"WORKING ON LINE: {line}") + if self.verbose: + print(f"WORKING ON LINE: {line}") # Cater for continuation lines for arrays of strings, etc -# if "=" in line: - # A new (key, value) pair is found -# key, value = [x.strip() for x in line.lstrip(":").split("=", 1)] -# else: - # Assume a continuation of th last value -# value += " " + line.strip() + # if "=" in line: + # A new (key, value) pair is found + # key, value = [x.strip() for x in line.lstrip(":").split("=", 1)] + # else: + # Assume a continuation of th last value + # value += " " + line.strip() # Send last key and last value (from last iteration of loop) and line to get new value key, value = self._parse_key_value_multiline_safe(line, key, value) @@ -188,14 +213,20 @@ def _ordered_dict(self, content): return resp def to_yaml(self): - return yaml.dump(self.to_dict(), Dumper=yaml.SafeDumper, - default_flow_style=False, sort_keys=False) + return yaml.dump( + self.to_dict(), + Dumper=yaml.SafeDumper, + default_flow_style=False, + sort_keys=False, + ) def to_dict(self): - return {"dimensions": self.dimensions, - "variables": self.variables, - "global_attributes": self.global_attrs, - "inpt": self.inpt} + return { + "dimensions": 
self.dimensions, + "variables": self.variables, + "global_attributes": self.global_attrs, + "inpt": self.inpt, + } def read(fpath, verbose=False): diff --git a/checksit/readers/image.py b/checksit/readers/image.py index 5052347e..4e208973 100644 --- a/checksit/readers/image.py +++ b/checksit/readers/image.py @@ -1,6 +1,7 @@ import subprocess as sp import yaml + def get_output(cmd): subp = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE) return subp.stdout.read().decode("charmap"), subp.stderr.read().decode("charmap") @@ -16,20 +17,22 @@ def __init__(self, inpt, verbose=False): self._parse(inpt) def _parse(self, inpt): - if self.verbose: print(f"[INFO] Parsing input: {inpt[:100]}...") + if self.verbose: + print(f"[INFO] Parsing input: {inpt[:100]}...") self.global_attrs = {} exiftool_arguments = self.base_exiftool_arguments + [inpt] exiftool_return_string = sp.check_output(exiftool_arguments) raw_global_attrs = yaml.load(exiftool_return_string, Loader=yaml.SafeLoader)[0] for tag_name in raw_global_attrs.keys(): - value_type = type(raw_global_attrs[tag_name]) - if value_type == list: - self.global_attrs[tag_name] = str(raw_global_attrs[tag_name][0]) - else: - self.global_attrs[tag_name] = str(raw_global_attrs[tag_name]) + value_type = type(raw_global_attrs[tag_name]) + if value_type == list: + self.global_attrs[tag_name] = str(raw_global_attrs[tag_name][0]) + else: + self.global_attrs[tag_name] = str(raw_global_attrs[tag_name]) def _find_exiftool(self): - if self.verbose: print("[INFO] Searching for exiftool...") + if self.verbose: + print("[INFO] Searching for exiftool...") which_output, which_error = get_output("which exiftool") if which_error.startswith("which: no exiftool in"): msg = ( @@ -39,15 +42,17 @@ def _find_exiftool(self): raise RuntimeError(msg) else: self.exiftool_location = which_output.strip() - if self.verbose: print(f"[INFO] Found exiftool at {self.exiftool_location}.") + if self.verbose: + print(f"[INFO] Found exiftool at {self.exiftool_location}.") - def _attrs_dict(self,content_lines): + def _attrs_dict(self, content_lines): attr_dict = {} for line in content_lines: - if self.verbose: print(f"WORKING ON LINE: {line}") - key_0 = line.split("=",1)[0].strip() - key = key_0[1:] #removes first character - unwanted quotation marks - value = line.split("=",1)[1].strip() + if self.verbose: + print(f"WORKING ON LINE: {line}") + key_0 = line.split("=", 1)[0].strip() + key = key_0[1:] # removes first character - unwanted quotation marks + value = line.split("=", 1)[1].strip() attr_dict[key] = value return attr_dict @@ -57,4 +62,3 @@ def to_dict(self): def read(fpath, verbose=False): return ImageParser(fpath, verbose=verbose) - diff --git a/checksit/rules/processors.py b/checksit/rules/processors.py index 389ebe19..3f3539a7 100644 --- a/checksit/rules/processors.py +++ b/checksit/rules/processors.py @@ -10,4 +10,4 @@ def uppercase(value): def no_extension(value): - return os.path.splitext(value)[0] \ No newline at end of file + return os.path.splitext(value)[0] diff --git a/checksit/rules/rule_funcs.py b/checksit/rules/rule_funcs.py index 57007f4d..f3560868 100644 --- a/checksit/rules/rule_funcs.py +++ b/checksit/rules/rule_funcs.py @@ -59,7 +59,10 @@ def match_one_or_more_of(value, context, extras=None, label=""): """ String value or list value must match one of more of list given in extras """ - def as_set(x, sep): return set([i.strip() for i in x.split(sep)]) + + def as_set(x, sep): + return set([i.strip() for i in x.split(sep)]) + options = as_set(extras[0], 
rule_splitter) values = as_set(value, ",") @@ -99,12 +102,14 @@ def validate_image_date_time(value, context, extras=None, label=""): for f in ["%Y:%m:%d %H:%M:%S", "%Y:%m:%d %H:%M:%S.%f"]: if match == False: try: - match = (value == datetime.strptime(value, f).strftime(f)) + match = value == datetime.strptime(value, f).strftime(f) except ValueError: pass if not match: - errors.append(f"{label} '{value}' needs to be of the format YYYY:MM:DD hh:mm:ss or YYYY:MM:DD hh:mm:ss.s") + errors.append( + f"{label} '{value}' needs to be of the format YYYY:MM:DD hh:mm:ss or YYYY:MM:DD hh:mm:ss.s" + ) return errors @@ -113,7 +118,7 @@ def validate_orcid_ID(value, context, extras=None, label=""): """ A function to verify the format of an orcid ID """ - orcid_string = "https://orcid.org/" # required format of start of the string + orcid_string = "https://orcid.org/" # required format of start of the string errors = [] @@ -122,23 +127,31 @@ def validate_orcid_ID(value, context, extras=None, label=""): # Check that total the length is correct if len(value) != 37: - errors.append(f"{label} '{value}' needs to be of the format https://orcid.org/XXXX-XXXX-XXXX-XXXX") + errors.append( + f"{label} '{value}' needs to be of the format https://orcid.org/XXXX-XXXX-XXXX-XXXX" + ) # Check the start of the string (first 18 characters) - elif (value[0:18] != orcid_string or - + elif ( + value[0:18] != orcid_string + or # Check that the "-" are in the correct places - value[22] != "-" or - value[27] != "-" or - value[32] != "-" or - + value[22] != "-" + or value[27] != "-" + or value[32] != "-" + or # Check that the last characters contain only "-" and digits (plus 'X' for last digit) not ( - PI_orcid_digits_only.isdigit() or (PI_orcid_digits_only[0:15].isdigit() and PI_orcid_digits_only[15] == "X") + PI_orcid_digits_only.isdigit() + or ( + PI_orcid_digits_only[0:15].isdigit() and PI_orcid_digits_only[15] == "X" + ) ) ): - errors.append(f"{label} '{value}' needs to be of the format https://orcid.org/XXXX-XXXX-XXXX-XXXX") + errors.append( + f"{label} '{value}' needs to be of the format https://orcid.org/XXXX-XXXX-XXXX-XXXX" + ) return errors @@ -147,23 +160,33 @@ def list_of_names(value, context, extras=None, label=""): """ A function to verify the names of people when a list of names may be provided """ - name_pattern = r'(.)+, (.)+ ?((.)+|((.)\.))' # The format names should be written in - character_name_pattern = r'[A-Za-z_À-ÿ\-\'\ \.\,]+' + name_pattern = ( + r"(.)+, (.)+ ?((.)+|((.)\.))" # The format names should be written in + ) + character_name_pattern = r"[A-Za-z_À-ÿ\-\'\ \.\,]+" warnings = [] if type(value) == list: for i in value: if not re.fullmatch(name_pattern, i): - warnings.append(f"{label} '{value}' should be of the format , or , where appropriate") + warnings.append( + f"{label} '{value}' should be of the format , or , where appropriate" + ) if not re.fullmatch(character_name_pattern, i): - warnings.append(f"{label} '{value}' - please use characters A-Z, a-z, À-ÿ where appropriate") + warnings.append( + f"{label} '{value}' - please use characters A-Z, a-z, À-ÿ where appropriate" + ) if type(value) == str: if not re.fullmatch(name_pattern, value): - warnings.append(f"{label} '{value}' should be of the format , or , where appropriate") + warnings.append( + f"{label} '{value}' should be of the format , or , where appropriate" + ) if not re.fullmatch(character_name_pattern, value): - warnings.append(f"{label} '{value}' - please use characters A-Z, a-z, À-ÿ where appropriate") + warnings.append( + f"{label} 
'{value}' - please use characters A-Z, a-z, À-ÿ where appropriate" + ) return warnings @@ -179,10 +202,14 @@ def headline(value, context, extras=None, label=""): else: if len(value) > 150: - warnings.append(f"{label} '{value}' should contain no more than one sentence") + warnings.append( + f"{label} '{value}' should contain no more than one sentence" + ) if value.count(".") >= 2: - warnings.append(f"{label} '{value}' should contain no more than one sentence") + warnings.append( + f"{label} '{value}' should contain no more than one sentence" + ) if not value[0].isupper(): warnings.append(f"{label} '{value}' should start with a capital letter") @@ -199,7 +226,7 @@ def title_check(value, context, extras=None, label=""): """ errors = [] - if value != os.path.basename(context) : + if value != os.path.basename(context): errors.append(f"{label} '{value}' must match the name of the file") return errors @@ -211,11 +238,12 @@ def url_checker(value, context, extras=None, label=""): """ warnings = [] - try: url=urlopen(value) + try: + url = urlopen(value) except: warnings.append(f"{label} '{value}' is not a reachable url") else: - if url.getcode() != 200: # (200 means it exists and is up and reachable) + if url.getcode() != 200: # (200 means it exists and is up and reachable) warnings.append(f"{label} '{value}' is not a reachable url") finally: return warnings @@ -230,9 +258,13 @@ def relation_url_checker(value, context, extras=None, label=""): if " " not in value: errors.append(f"{label} '{value}' should contain a space before the url") else: - relation_url = value.partition(" ")[2] # extract only the url part of the relation string + relation_url = value.partition(" ")[ + 2 + ] # extract only the url part of the relation string if url_checker(relation_url, context, extras, label) != []: - errors.extend(url_checker(relation_url, context, extras, label)) # check the url exists using the url_checker() function defined above + errors.extend( + url_checker(relation_url, context, extras, label) + ) # check the url exists using the url_checker() function defined above return errors @@ -243,7 +275,7 @@ def latitude(value, context, extras=None, label=""): """ errors = [] - latitude = re.findall(r'[0-9]+', value) + latitude = re.findall(r"[0-9]+", value) int_latitude = int(latitude[0]) dec_latitude = int(latitude[1]) @@ -259,7 +291,7 @@ def longitude(value, context, extras=None, label=""): """ errors = [] - longitude = re.findall(r'[0-9]+', value) + longitude = re.findall(r"[0-9]+", value) int_longitude = int(longitude[0]) dec_longitude = int(longitude[1]) @@ -274,14 +306,20 @@ def ceda_platform(value, context, extras=None, label=""): A function to check if the platform is in the CEDA catalogue API """ errors = [] - api_result = requests.get(f"http://api.catalogue.ceda.ac.uk/api/v2/identifiers.json/?url={value}") - if (len(api_result.json()['results']) == 1) and (api_result.json()['results'][0]['relatedTo']['short_code'] == "plat"): + api_result = requests.get( + f"http://api.catalogue.ceda.ac.uk/api/v2/identifiers.json/?url={value}" + ) + if (len(api_result.json()["results"]) == 1) and ( + api_result.json()["results"][0]["relatedTo"]["short_code"] == "plat" + ): legit_platform = True else: legit_platform = False if not legit_platform: - errors.append(f"{label} '{value}' is not a valid platform in the CEDA catalogue") + errors.append( + f"{label} '{value}' is not a valid platform in the CEDA catalogue" + ) return errors @@ -292,10 +330,14 @@ def ncas_platform(value, context, extras=None, label=""): """ errors 
= [] - latest_version = requests.get("https://github.com/ncasuk/ncas-data-platform-vocabs/releases/latest").url.split("/")[-1] + latest_version = requests.get( + "https://github.com/ncasuk/ncas-data-platform-vocabs/releases/latest" + ).url.split("/")[-1] - result = requests.get(f"https://raw.githubusercontent.com/ncasuk/ncas-data-platform-vocabs/{latest_version}/AMF_CVs/AMF_platform.json") - ncas_platforms = result.json()['platform'].keys() + result = requests.get( + f"https://raw.githubusercontent.com/ncasuk/ncas-data-platform-vocabs/{latest_version}/AMF_CVs/AMF_platform.json" + ) + ncas_platforms = result.json()["platform"].keys() if value not in ncas_platforms: errors.append(f"{label} '{value}' is not a valid NCAS platform") @@ -315,7 +357,9 @@ def check_qc_flags(value, context, extras=None, label=""): # check flag_values are correctly formatted (should be array of bytes) if not (isinstance(value, np.ndarray) or isinstance(value, tuple)): - errors.append(f"{label} QC flag_values must be an array or tuple of byte values, not '{type(value)}'.") + errors.append( + f"{label} QC flag_values must be an array or tuple of byte values, not '{type(value)}'." + ) # check there are at least two values and they start with 0 and 1 if not len(value) > 2: @@ -325,12 +369,18 @@ def check_qc_flags(value, context, extras=None, label=""): # check there are at least two meanings and the first two are correct if not len(meanings) > 2: - errors.append(f"{label} There must be at least two QC flag meanings (space separated).") + errors.append( + f"{label} There must be at least two QC flag meanings (space separated)." + ) elif not np.all(meanings[:2] == ["not_used", "good_data"]): - errors.append(f"{label} First two QC flag_meanings must be 'not_used' and 'good_data'.") + errors.append( + f"{label} First two QC flag_meanings must be 'not_used' and 'good_data'." + ) # check number of values is same as number of meanings if not len(value) == len(meanings): - errors.append(f"{label} Number of flag_values must equal number of flag_meanings.") + errors.append( + f"{label} Number of flag_values must equal number of flag_meanings." + ) return errors diff --git a/checksit/rules/rules.py b/checksit/rules/rules.py index f269e534..19106bf7 100644 --- a/checksit/rules/rules.py +++ b/checksit/rules/rules.py @@ -6,6 +6,7 @@ from . 
import rule_funcs from ..config import get_config + conf = get_config() rules_prefix = conf["settings"]["rules_prefix"] @@ -15,87 +16,82 @@ class Rules: def __init__(self): - _NOT_APPLICABLE_RULES = "(N/A)|(NA)|(N A)|(n/a)|(na)|(n a)|" \ - "(Not Applicable)|(Not applicable)|(Not available)|(Not Available)|" \ - "(not applicable)|(not available)" + _NOT_APPLICABLE_RULES = ( + "(N/A)|(NA)|(N A)|(n/a)|(na)|(n a)|" + "(Not Applicable)|(Not applicable)|(Not available)|(Not Available)|" + "(not applicable)|(not available)" + ) self.static_regex_rules = { - "integer": { - "regex-rule": r"-?\d+", - "example": "10" - }, + "integer": {"regex-rule": r"-?\d+", "example": "10"}, "valid-email": { "regex-rule": r"[^@\s]+@[^@\s]+\.[^\s@]+", - "example": "sam@example.com" + "example": "sam@example.com", }, "valid-url": { "regex-rule": r"https?://[^\s]+\.[^\s]*[^\s\.](/[^\s]+)?", - "example": "https://github.com" + "example": "https://github.com", }, "valid-url-or-na": { - "regex-rule": r"(https?://[^\s]+\.[^\s]*[^\s\.](/[^\s]+)?)|" + _NOT_APPLICABLE_RULES, - "example": "https://github.com" - }, - "match:vN.M": { - "regex-rule": r"v\d\.\d", - "example": "v1.0" + "regex-rule": r"(https?://[^\s]+\.[^\s]*[^\s\.](/[^\s]+)?)|" + + _NOT_APPLICABLE_RULES, + "example": "https://github.com", }, + "match:vN.M": {"regex-rule": r"v\d\.\d", "example": "v1.0"}, "datetime": { "regex-rule": r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?", - "example": "2023-11-17T15:00:00" + "example": "2023-11-17T15:00:00", }, "datetimeZ": { "regex-rule": r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?Z", "example": "2023-11-17T15:00:00Z" }, "datetime-or-na": { - "regex-rule": r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?)|" + _NOT_APPLICABLE_RULES, - "example": "2023-11-17T15:00:00" - }, - "number": { - "regex-rule": r"-?\d+(\.\d+)?", - "example": "10.5" + "regex-rule": r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?)|" + + _NOT_APPLICABLE_RULES, + "example": "2023-11-17T15:00:00", }, + "number": {"regex-rule": r"-?\d+(\.\d+)?", "example": "10.5"}, "location": { - "regex-rule": r'(.)+(\,\ )(.)+', - "example": "Chilbolton Atmospheric Observatory, Chilbolton, Hampshire, UK" + "regex-rule": r"(.)+(\,\ )(.)+", + "example": "Chilbolton Atmospheric Observatory, Chilbolton, Hampshire, UK", }, "latitude-image": { - "regex-rule": r'[\+|\-]?[0-9]{1,2}\.[0-9]{0,6}', - "example": "12.345678" + "regex-rule": r"[\+|\-]?[0-9]{1,2}\.[0-9]{0,6}", + "example": "12.345678", }, "longitude-image": { - "regex-rule": r'[\+|\-]?1?[0-9]{1,2}\.[0-9]{0,6}', - "example": "123.456789" + "regex-rule": r"[\+|\-]?1?[0-9]{1,2}\.[0-9]{0,6}", + "example": "123.456789", }, "title": { - "regex-rule": r'(.)+_(.)+_([1-2][0-9][0-9][0-9])([0][0-9]|[1][0-2])?([0-2][0-9]|[3][0-1])?-?([0-1][0-9]|[2][0-3])?([0-5][0-9])?([0-5][0-9])?(_.+)?_v([0-9]+)\.([0-9]+)\.(png|PNG|jpg|JPG|jpeg|JPEG)', - "example": "ncas-cam-9_cao_20210623-215001_v1.0.jpg" + "regex-rule": r"(.)+_(.)+_([1-2][0-9][0-9][0-9])([0][0-9]|[1][0-2])?([0-2][0-9]|[3][0-1])?-?([0-1][0-9]|[2][0-3])?([0-5][0-9])?([0-5][0-9])?(_.+)?_v([0-9]+)\.([0-9]+)\.(png|PNG|jpg|JPG|jpeg|JPEG)", + "example": "ncas-cam-9_cao_20210623-215001_v1.0.jpg", }, "title-data-product": { - "regex-rule": r'(.)+_(.)+_([1-2][0-9][0-9][0-9])([0][0-9]|[1][0-2])?([0-2][0-9]|[3][0-1])?-?([0-1][0-9]|[2][0-3])?([0-5][0-9])?([0-5][0-9])?_(plot|photo)((.)+)?_v([0-9]+)\.([0-9]+)\.(png|PNG|jpg|JPG|jpeg|JPEG)', - "example": "ncas-cam-9_cao_20210623-215001_photo_v1.0.jpg" + "regex-rule": 
r"(.)+_(.)+_([1-2][0-9][0-9][0-9])([0][0-9]|[1][0-2])?([0-2][0-9]|[3][0-1])?-?([0-1][0-9]|[2][0-3])?([0-5][0-9])?([0-5][0-9])?_(plot|photo)((.)+)?_v([0-9]+)\.([0-9]+)\.(png|PNG|jpg|JPG|jpeg|JPEG)", + "example": "ncas-cam-9_cao_20210623-215001_photo_v1.0.jpg", }, "name-format": { - "regex-rule": r'([^,])+, ([^,])+( ?[^,]+|((.)\.))', - "example": "Jones, Sam" + "regex-rule": r"([^,])+, ([^,])+( ?[^,]+|((.)\.))", + "example": "Jones, Sam", }, "name-characters": { - "regex-rule": r'[A-Za-z_À-ÿ\-\'\ \.\,]+', - "example": "Jones, Sam" + "regex-rule": r"[A-Za-z_À-ÿ\-\'\ \.\,]+", + "example": "Jones, Sam", }, "altitude-image-warning": { - "regex-rule": r'-?\d+\sm', # should be integers only for images - "example": "123 m" + "regex-rule": r"-?\d+\sm", # should be integers only for images + "example": "123 m", }, "altitude-image": { - "regex-rule": r'-?\d+(\.\d+)?\sm', - "example": "123.45 m" + "regex-rule": r"-?\d+(\.\d+)?\sm", + "example": "123.45 m", }, "ncas-email": { - "regex-rule": r'[^@\s]+@ncas.ac.uk', - "example": "sam.jones@ncas.ac.uk" - } + "regex-rule": r"[^@\s]+@ncas.ac.uk", + "example": "sam.jones@ncas.ac.uk", + }, } def _map_type_rule(self, type_rule): @@ -105,7 +101,7 @@ def _map_type_rule(self, type_rule): "int": int, "float": float, "string": str, - "str": str + "str": str, } return mappings[type_rule] @@ -138,7 +134,9 @@ def check(self, rule_lookup, value, context=None, label=""): type_rule = i.split(":")[1] if not isinstance(value, self._map_type_rule(type_rule)): - output.append(f"{label} Value '{value}' is not of required type: '{type_rule}'.") + output.append( + f"{label} Value '{value}' is not of required type: '{type_rule}'." + ) elif i.startswith("regex-rule"): regex_rule = i.split(":", 1)[1] @@ -147,15 +145,21 @@ def check(self, rule_lookup, value, context=None, label=""): pattern = self.static_regex_rules[regex_rule]["regex-rule"] if not re.match("^" + pattern + "$", value): - output.append(f"{label} Value '{value}' does not match regex rule: '{regex_rule}' - Example valid value '{self.static_regex_rules[regex_rule]['example']}'.") + output.append( + f"{label} Value '{value}' does not match regex rule: '{regex_rule}' - Example valid value '{self.static_regex_rules[regex_rule]['example']}'." + ) else: raise Exception(f"Regex rule not found with rule ID: {i}.") elif i.startswith("regex"): - pattern = i.split(":", 1)[1] # in case pattern has colons in it, e.g. a URL + pattern = i.split(":", 1)[ + 1 + ] # in case pattern has colons in it, e.g. a URL if not re.match(f"^{pattern}$", value): - output.append(f"{label} Value '{value}' does not match regular expression: '{pattern}'.") + output.append( + f"{label} Value '{value}' does not match regular expression: '{pattern}'." 
+ ) else: raise Exception(f"Rule not found with rule ID: {i}.") @@ -164,4 +168,3 @@ def check(self, rule_lookup, value, context=None, label=""): rules = Rules() - diff --git a/checksit/specs.py b/checksit/specs.py index e45b036e..552050b3 100644 --- a/checksit/specs.py +++ b/checksit/specs.py @@ -11,13 +11,19 @@ def _parse_specs(spec_files): - return dict([(os.path.basename(f)[:-4], yaml.load(open(f), Loader=yaml.SafeLoader)) for f in spec_files]) + return dict( + [ + (os.path.basename(f)[:-4], yaml.load(open(f), Loader=yaml.SafeLoader)) + for f in spec_files + ] + ) def load_specs(spec_ids=None): spec_ids = spec_ids or [] - spec_files = [f"{specs_dir}/{spec_id}.yml" for spec_id in spec_ids] or \ - glob.glob(f"{specs_dir}/*.yml") + spec_files = [f"{specs_dir}/{spec_id}.yml" for spec_id in spec_ids] or glob.glob( + f"{specs_dir}/*.yml" + ) return _parse_specs(spec_files) @@ -29,7 +35,11 @@ def show_specs(spec_ids=None, verbose=False): if not spec_ids: specs = all_specs.items() else: - specs = [(spec_ids[spec_ids_names.index(spec_id)], spec) for (spec_id, spec) in all_specs.items() if spec_id in spec_ids_names] + specs = [ + (spec_ids[spec_ids_names.index(spec_id)], spec) + for (spec_id, spec) in all_specs.items() + if spec_id in spec_ids_names + ] print("Specifications:") for spec_id, spec in specs: @@ -63,8 +73,8 @@ def run_checks(self, record, skip_spellcheck=False): for check_id, check_dict in self.spec.items(): check_errors, check_warnings = self._run_check( - record, check_dict, skip_spellcheck=skip_spellcheck - ) + record, check_dict, skip_spellcheck=skip_spellcheck + ) errors.extend(check_errors) warnings.extend(check_warnings) diff --git a/checksit/summary.py b/checksit/summary.py index c6c27e4b..e0dcb975 100755 --- a/checksit/summary.py +++ b/checksit/summary.py @@ -17,7 +17,8 @@ def get_max_column_count(files, sep): with open(f) as reader: for line in reader: c = line.count(sep) - if c > count: count = c + if c > count: + count = c return count @@ -30,8 +31,9 @@ def do_exclude(err, exclude_patterns): return False -def summarise(log_files=None, log_directory=None, show_files=False, - exclude=None, verbose=False): +def summarise( + log_files=None, log_directory=None, show_files=False, exclude=None, verbose=False +): log_files = log_files or find_log_files(log_directory) exclude_patterns = exclude or [] @@ -39,7 +41,7 @@ def summarise(log_files=None, log_directory=None, show_files=False, print("[ERROR] No log files found!") return - if verbose: + if verbose: print(f"[INFO] Reading {len(log_files)} files:") print(f"\t{log_files[0]} ...to... 
{log_files[-1]}")
@@ -49,7 +51,7 @@
print(f"[INFO] Max cols: {n_cols}")
known_cols = ["filepath", "template", "highest_error", "error_count"]
- err_cols = [f"err_{i:02d}" for i in range(n_cols-len(known_cols)+1)]
+ err_cols = [f"err_{i:02d}" for i in range(n_cols - len(known_cols) + 1)]
headers = known_cols + err_cols
print(f"Headers: {headers}")
@@ -58,7 +60,7 @@
for filename in log_files:
df = pd.read_csv(filename, sep=sep, index_col=None, header=None, names=headers)
- df = df.replace({r"^\s*|\s*$":""}, regex=True)
+ df = df.replace({r"^\s*|\s*$": ""}, regex=True)
df["logfile"] = os.path.basename(filename)
count += len(df)
li.append(df)
@@ -75,7 +77,8 @@
for err_col in err_cols:
for err in df[err_col].unique():
err = err.strip()
- if not err or do_exclude(err, exclude_patterns): continue
+ if not err or do_exclude(err, exclude_patterns):
+ continue
filepaths = sorted(df[df[err_col] == err]["filepath"])
errors_by_type[err].extend(filepaths)
@@ -91,11 +94,10 @@
filepaths = all_errors[err]
print(f"\t\t{err} [found in {len(filepaths)} file(s)]")
- if show_files:
+ if show_files:
print("\n------- File paths --------\n")
for err in all_errors:
- print(f"\t\t{err}")
+ print(f"\t\t{err}")
for filepath in all_errors[err]:
print(f"\t\t\t{filepath}")
-
diff --git a/checksit/utils.py b/checksit/utils.py
index f8bfefce..b89093d3 100644
--- a/checksit/utils.py
+++ b/checksit/utils.py
@@ -33,11 +33,10 @@ def get_public_funcs(module):
if item[0] != "_":
prop = getattr(module, item)
if inspect.isfunction(prop):
- funcs.append(prop)
+ funcs.append(prop)
return funcs
def is_undefined(x):
return not x and x != 0
-
diff --git a/docs/source/dev/ncas_standard_specifics.rst b/docs/source/dev/ncas_standard_specifics.rst
index d0628350..327e774d 100644
--- a/docs/source/dev/ncas_standard_specifics.rst
+++ b/docs/source/dev/ncas_standard_specifics.rst
@@ -1,16 +1,39 @@
NCAS Data Standards
===================
+If given a file to check where the file name starts with ``ncas-``, as in the start of an NCAS
+instrument name, and no template or specs are specified, then ``checksit`` will attempt to find specs
+to check the file against depending on which NCAS Data Standard is being used.
+
NCAS-GENERAL
------------
Automatic use of spec files
^^^^^^^^^^^^^^^^^^^^^^^^^^^
-When given a file to check and no template option has been specified, ``checksit/check.py`` looks to see if the file is a netCDF file, and if so if it has a Conventions global attribute with ``NCAS-GENERAL``, ``NCAS-AMOF`` or ``NCAS-AMF`` it its value, identifying the file as needing to conform to the ``NCAS-GENERAL`` data standard. ``checksit`` then identifies which version of the standard is being used, using the numbers that follow the standard identifier in the Conventions attribute. The data product and deployment mode are obtained from the file, and specs for product and deployment are added to specs for global attributes and file naming for that version of the standard.
+If the file is a netCDF file with the ``Conventions`` global attribute containing one of ``NCAS-GENERAL``,
+``NCAS-AMOF`` or ``NCAS-AMF`` in its value, the file is identified as needing to conform to the
+``NCAS-GENERAL`` data standard. ``checksit`` then identifies which version of the standard is being
+used, using the numbers that follow the standard identifier in the Conventions attribute.
+The data product and deployment mode are obtained from the file, and specs for product and deployment
+are added to specs for global attributes and file naming for that version of the standard.
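+
+For example, a file whose ``Conventions`` attribute contains the value below (an illustrative value;
+the exact form in real files may differ) would be checked against version 2.0.0 of the standard:
+
+.. code-block::
+
+   Conventions = "CF-1.6 NCAS-GENERAL-2.0.0"
+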
Downloading of new versions
^^^^^^^^^^^^^^^^^^^^^^^^^^^
-If specs for the version of the standard do not exist within ``specs/groups``, ``checksit`` will attempt to download the vocabs for that version and create the spec files using the ``make_amof_specs`` function within ``checksit/make_specs.py``. However, if ``checksit`` cannot find the vocabs for that version, or does not have permission to write into the ``specs/groups`` folder, then an error is raised.
+If specs for the version of the standard do not exist within ``specs/groups``, ``checksit`` will
+attempt to download the vocabs for that version and create the spec files using the ``make_amof_specs``
+function within ``checksit/make_specs.py``. However, if ``checksit`` cannot find the vocabs for that
+version, or does not have permission to write into the ``specs/groups`` folder, then an error is
+raised.
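+
+As the ``__main__`` block in ``checksit/make_specs.py`` shows, the spec files for a version can also
+be created by hand by running that module directly with the version number as its only argument
+(assuming the vocabs for that version are available locally), e.g.
+
+.. code-block::
+
+   python checksit/make_specs.py 2.0.0
+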
+NCAS-IMAGE
+----------
+If, instead of a netCDF file, ``checksit`` is checking an image file, based on the file extension being
+one of ``png``, ``jpg`` or ``jpeg`` (or uppercase versions), and the file has the
+``XMP-photoshop:Instructions`` metadata tag with a value mentioning the NCAS Image Standard, then
+``checksit`` will find specs related to NCAS-IMAGE. The version of the standard is identified using
+the ``Instructions`` tag, and specs relating to either the ``photo`` or ``plot`` data product are
+selected depending on the file name. The data product spec is combined with a global tags spec file
+that covers tags required by the standard regardless of which data product is used.
\ No newline at end of file
diff --git a/docs/source/dev/specs.rst b/docs/source/dev/specs.rst
index e66e8942..6f7c52df 100644
--- a/docs/source/dev/specs.rst
+++ b/docs/source/dev/specs.rst
@@ -4,12 +4,14 @@ Specs
Location
--------
-Spec files are saved within the ``specs/groups`` folder at the top level of the ``checksit`` repository. Files can be grouped within folders in this directory if required.
+Spec files are saved within the ``specs/groups`` folder at the top level of the ``checksit`` repository.
+Files can be grouped within folders in this directory if required.
Format of spec file
-------------------
-Spec files are in YAML format, and are split into sections by function calls. An example section from a YAML spec file might look like
+Spec files are in YAML format, and are split into sections by function calls. An example section from
+a YAML spec file might look like
.. code-block:: yaml
@@ -25,17 +27,22 @@ Spec files are in YAML format, and are split into sections by function calls. An
Line 1: ``required-global-attrs`` is a label which must be unique within the file.
-Line 2: ``func: checksit.generic.check_global_attrs`` points to the function that will be used, in this case it's the ``check_global_attrs`` function within the ``checksit/generic.py`` file.
+Line 2: ``func: checksit.generic.check_global_attrs`` points to the function that will be used, in this
+case it's the ``check_global_attrs`` function within the ``checksit/generic.py`` file.
-Line 3: ``params`` indicates the following section is the parameters that will be passed to the function.
+Line 3: ``params`` indicates the following section is the parameters that will be passed to the
+function.
Line 4: ``defined_attrs`` is the name of a parameter in the function
-Lines 5-6: ``source`` and ``title`` are the values being passed to the parameter. As they have a proceeding ``-``\ , they are parsed as a list, i.e. ``['source', 'title']``
+Lines 5-6: ``source`` and ``title`` are the values being passed to the parameter. As they have a
+preceding ``-``\ , they are parsed as a list, i.e. ``['source', 'title']``
Line 7: ``vocab_attrs`` is another parameter in the function
-Line 8-9: Values for the ``vocab_attrs`` parameter. As these do not have a ``-`` proceeding, they are parsed as a dictionary, i.e. ``{'Conventions': '__vocabs__:cf-netcdf:Conventions', 'another_attribute': '__vocabs__:attribute_vocabs'}``
+Lines 8-9: Values for the ``vocab_attrs`` parameter. As these do not have a preceding ``-``\ , they are
+parsed as a dictionary, i.e.
+``{'Conventions': '__vocabs__:cf-netcdf:Conventions', 'another_attribute': '__vocabs__:attribute_vocabs'}``
..
diff --git a/docs/source/dev/templates.rst b/docs/source/dev/templates.rst
index 72ca20d7..1dd5ecdb 100644
--- a/docs/source/dev/templates.rst
+++ b/docs/source/dev/templates.rst
@@ -4,9 +4,12 @@ Templates
Location
--------
-Template files are stored within the ``template-cache`` folder at the top level of the ``checksit`` repository.
+Template files are stored within the ``template-cache`` folder at the top level of the ``checksit``
+repository.
Format
------
-Template files can be saved in a number of formats. Using the ``--auto-cache`` flag when checking a netCDF file will create a template as a CDL file, as is output by ``ncdump -h``. Templates can also be saved as a YAML file, with references to ``__vocabs__`` and ``__rules__`` included.
+Template files can be saved in a number of formats. Using the ``--auto-cache`` flag when checking a
+netCDF file will create a template as a CDL file, as is output by ``ncdump -h``. Templates can also be
+saved as a YAML file, with references to ``__vocabs__`` and ``__rules__`` included.
diff --git a/docs/source/dev/vocabs.rst b/docs/source/dev/vocabs.rst
index 7c785e5d..ff5b0ee7 100644
--- a/docs/source/dev/vocabs.rst
+++ b/docs/source/dev/vocabs.rst
@@ -4,12 +4,14 @@ Vocabs
Location
--------
-Vocab files are stored in the ``checksit/vocabs`` folder, and can be grouped in folders within this directory.
+Vocab files are stored in the ``checksit/vocabs`` folder, and can be grouped in folders within this
+directory.
Format
------
-Vocab files are stored as JSON files. When a key has a number of acceptable options, these are grouped in a list format, e.g.
+Vocab files are stored as JSON files. When a key has a number of acceptable options, these are grouped
+in a list format, e.g.
.. code-block:: json
diff --git a/docs/source/dev/where_does_checksit_do_it.rst b/docs/source/dev/where_does_checksit_do_it.rst
index 3eceacf1..aab1917a 100644
--- a/docs/source/dev/where_does_checksit_do_it.rst
+++ b/docs/source/dev/where_does_checksit_do_it.rst
@@ -6,31 +6,52 @@ Description on some of the key parts of checksit, how they work, what to add/edi
checksit working directory
--------------------------
-By default, checksit needs to be run from the top level of the checksit repository. This can be changed by editing the ``basedir`` value in ``checksit/etc/checksit.ini`` to the location of the checksit repository before installing checksit.
+By default, checksit needs to be run from the top level of the checksit repository. This can be changed
+by editing the ``basedir`` value in ``checksit/etc/checksit.ini`` to the location of the checksit
+repository before installing checksit.
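+
+For example, with the repository cloned to ``/home/users/sjones/checksit`` (an illustrative path),
+the edited ``basedir`` line in ``checksit/etc/checksit.ini`` might look like
+
+.. code-block::
+
+   basedir = /home/users/sjones/checksit
+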
Readers
-------
-One of the first things ``checksit`` has to do is read the file that it is being asked to check. Within ``checksit/readers`` are a number of Python scripts which includes a class that will read the file into a format suitable for ``checksit``\ , with , and a ``read`` function that returns the class in that file. This ``read`` function is called in ``checksit/check.py``.
+One of the first things ``checksit`` has to do is read the file that it is being asked to check.
+Within ``checksit/readers`` are a number of Python scripts, each of which includes a class that will
+read the file into a format suitable for ``checksit``\ , and a ``read`` function that returns the class
+in that file. This ``read`` function is called in ``checksit/check.py``.
Functions for specs
-------------------
-Functions used by checks from spec files are in ``checksit/generic.py``. These functions take a dictionary representation of the data file (as made by the ``to_dict`` function in the reader class), the parameters that are needed for the function which have values defined in the spec file, plus the ``skip_spellcheck`` variable, which should have the default value of ``False`` (alternatively, ``**kwargs`` could be included in the function parameters instead of ``skip_spellcheck`` if the spellchecking functionality is not required). The ``skip_spellcheck`` parameter is added to the specs by ``checksit``\ , and does not need to be included in the spec YAML files.
+Functions used by checks from spec files are in ``checksit/generic.py``. These functions take a
+dictionary representation of the data file (as made by the ``to_dict`` function in the reader class),
+the parameters that are needed for the function, which have values defined in the spec file, plus the
+``skip_spellcheck`` variable, which should have the default value of ``False`` (alternatively,
+``**kwargs`` could be included in the function parameters instead of ``skip_spellcheck`` if the
+spellchecking functionality is not required). The ``skip_spellcheck`` parameter is added to the specs
+by ``checksit``\ , and does not need to be included in the spec YAML files.
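+
+A minimal sketch of the expected shape of such a function is shown below; the function name,
+parameters and message text are hypothetical, and the pair of returned lists is assumed from how
+``checksit/specs.py`` collects errors and warnings from each check:
+
+.. code-block:: python
+
+   def check_my_attrs(dct, defined_attrs, skip_spellcheck=False):
+       # 'dct' is the dictionary form of the file, as returned by a reader's 'to_dict',
+       # and 'defined_attrs' is filled in from the 'params' section of the spec YAML
+       errors, warnings = [], []
+       for attr in defined_attrs:
+           if attr not in dct.get("global_attributes", {}):
+               errors.append(f"Required global attribute '{attr}' not found.")
+       return errors, warnings
+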
-The spellchecking functionality aims to spot if a file might have a spelling error in. For example, if a spec states that there should be a variable called ``time`` in the file, but one is not found, it will then look for slight misspellings of ``time``\ , although requiring the first letter to be correct. The function that does this is called ``search_close_match`` and is in the ``checksit/generic.py`` file, and can be called from other functions within this file.
+The spellchecking functionality aims to spot when a file might contain a spelling error. For example, if
+a spec states that there should be a variable called ``time`` in the file, but one is not found, it
+will then look for slight misspellings of ``time``\ , although requiring the first letter to be
+correct. The function that does this is called ``search_close_match`` and is in the
+``checksit/generic.py`` file, and can be called from other functions within this file.
vocabs checks
-------------
-``checksit`` allows for templates and specs to define checks against known vocabularies. These vocabs are stored as JSON files within ``checksit/vocabs``\ , and can be grouped into folders within this directory. This folder is referenced through ``checksit`` as ``__vocabs__``. Defining a vocab check could look like
+``checksit`` allows for templates and specs to define checks against known vocabularies. These vocabs
+are stored as JSON files within ``checksit/vocabs``\ , and can be grouped into folders within this
+directory. This folder is referenced through ``checksit`` as ``__vocabs__``. Defining a vocab check
+could look like
.. code-block::
variables:
time: __vocabs__:AMF_CVs/2.0.0/AMF_product_common_variable_land:product_common_variable_land:time
-which states that the ``time`` variable must match the vocab found in ``checksit/vocabs/AMF_CFs/2.0.0/AMF_product_common_variable_land.json`` (note the ``.json`` extension is excluded when specifying the vocab file), using the data in that file located by the ``product_common_variable_land`` key and then the ``time`` key.
+which states that the ``time`` variable must match the vocab found in
+``checksit/vocabs/AMF_CVs/2.0.0/AMF_product_common_variable_land.json`` (note the ``.json`` extension
+is excluded when specifying the vocab file), using the data in that file located by the
+``product_common_variable_land`` key and then the ``time`` key.
An option is also included for a vocab match of one value out of many. For example,
@@ -38,23 +59,47 @@ An option is also included for a vocab match of one value out of many. For examp
platform: __vocabs__:AMF_CVs/2.0.0/AMF_platform:platform:__all__
-specifies ``platform`` should match one of the values found under the ``platform`` key in ``checksit/vocabs/AMF_CVs/2.0.0/AMF_platform.json``\ , and
+specifies ``platform`` should match one of the values found under the ``platform`` key in
+``checksit/vocabs/AMF_CVs/2.0.0/AMF_platform.json``\ , and
.. code-block::
source: __vocabs__:AMF_CVs/2.0.0/AMF_ncas_instrument:ncas_instrument:__all__:description
-requires ``source`` to match any of the ``description`` tags nested under the ``ncas_instrument`` key in ``checksit/vocabs/AMF_CVs/2.0.0/AMF_ncas_instrument.json``. In these cases, ``__all__`` acts similarly to the wildcard ``*`` in bash, but only one instance of ``__all__`` is allowed.
+requires ``source`` to match any of the ``description`` tags nested under the ``ncas_instrument`` key
+in ``checksit/vocabs/AMF_CVs/2.0.0/AMF_ncas_instrument.json``. In these cases, ``__all__`` acts
+similarly to the wildcard ``*`` in bash, but only one instance of ``__all__`` is allowed.
+
+URL vocabs
+^^^^^^^^^^
+
+Vocabularies can also be hosted online, instead of being included in the ``checksit`` package. This
+is particularly beneficial for vocabularies that may be updated regularly, meaning the latest changes
+do not need to be downloaded and ``checksit`` does not need to be updated every time the vocabulary
+is updated. These vocabularies should be accessible online as a JSON file in the same format as if it
+were in the ``checksit/vocabs`` folder.
+
+URL vocabs are referred to using ``__URL__`` in place of ``__vocabs__``, and the ``https://`` at the
+start of the URL should be omitted, for example
+
+.. code-block::
+
+   instrument: __URL__raw.githubusercontent.com/ncasuk/ncas-data-instrument-vocabs/__latest__/AMF_CVs/AMF_ncas_instrument.json:ncas_instrument:__all__
+
+In this example, ``checksit`` will replace ``__latest__`` with the tag name of the latest tagged
+release on GitHub. This will also happen for any URL that starts with ``raw.githubusercontent.com``
+and contains ``__latest__``.
rules checks
------------
-``checksit`` also has a number of rules it can check values against when doing template and spec checks, managed by the ``Rules`` class in ``checksit/rules/rules.py``. There are four types of rules:
+``checksit`` also has a number of rules it can check values against when doing template and spec
+checks, managed by the ``Rules`` class in ``checksit/rules/rules.py``. There are four types of rules:
* ``type-rule``\ : checks the value is of the correct type, e.g. integer, float or string
* ``regex``\ : checks the value matches a given regular expression
-* ``regex-rule``: checks the value matches a pre-defined regex. These are:
+* ``regex-rule``: checks the value matches a pre-defined regex. These are:
.. list-table::
:header-rows: 1
@@ -72,16 +117,40 @@ rules checks
* - "match:vN.M"
- ``r"v\d\.\d"``
* - "datetime"
- - ``"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?"``
+ - ``r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?"``
* - "datetime-or-na"
- - ``"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(.\d+)?)" + _NOT_APPLICABLE_RULES``
+ - ``r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?)" + _NOT_APPLICABLE_RULES``
* - "number"
- ``r"-?\d+(\.\d+)?"``
-
-
-where ``NOT_APPLICABLE_RULES`` cover phrases such as "Not Available", "Not applicable", "N/A" and others similar.
-
-
-* ``rule-func``\ : checks the value against a pre-defined function, which are defined in ``checksit/rules/rule_funcs.py``. Rule functions defined in this file include, for example ``match_one_of``\ , where a value mush match one option from a list, and ``string_of_length``\ , where a string must be of a defined length or longer (e.g. ``5`` or ``5+``\ ).
+ * - "location"
+ - ``r"(.)+(\,\ )(.)+"``
+ * - "latitude-image"
+ - ``r"[\+|\-]?[0-9]{1,2}\.[0-9]{0,6}"``
+ * - "longitude-image"
+ - ``r"[\+|\-]?1?[0-9]{1,2}\.[0-9]{0,6}"``
+ * - "title"
+ - ``r"(.)+_(.)+_([1-2][0-9][0-9][0-9])([0][0-9]|[1][0-2])?([0-2][0-9]|[3][0-1])?-?([0-1][0-9]|[2][0-3])?([0-5][0-9])?([0-5][0-9])?(_.+)?_v([0-9]+)\.([0-9]+)\.(png|PNG|jpg|JPG|jpeg|JPEG)"``
+ * - "title-data-product"
+ - ``r"(.)+_(.)+_([1-2][0-9][0-9][0-9])([0][0-9]|[1][0-2])?([0-2][0-9]|[3][0-1])?-?([0-1][0-9]|[2][0-3])?([0-5][0-9])?([0-5][0-9])?_(plot|photo)((.)+)?_v([0-9]+)\.([0-9]+)\.(png|PNG|jpg|JPG|jpeg|JPEG)"``
+ * - "name-format"
+ - ``r"([^,])+, ([^,])+( ?[^,]+|((.)\.))"``
+ * - "name-characters"
+ - ``r"[A-Za-z_À-ÿ\-\'\ \.\,]+"``
+ * - "altitude-image-warning"
+ - ``r"-?\d+\sm"``
+ * - "altitude-image"
+ - ``r"-?\d+(\.\d+)?\sm"``
+ * - "ncas-email"
+ - ``r"[^@\s]+@ncas.ac.uk"``
+
+
+where ``_NOT_APPLICABLE_RULES`` covers phrases such as "Not Available", "Not applicable", "N/A" and
+others similar.
+
+
+* ``rule-func``\ : checks the value against a pre-defined function; these functions are defined in
+ ``checksit/rules/rule_funcs.py``. Rule functions defined in this file include, for example
+ ``match_one_of``\ , where a value must match one option from a list, and ``string_of_length``\ ,
+ where a string must be of a defined length or longer (e.g. ``5`` or ``5+``\ ).
diff --git a/docs/source/index.rst b/docs/source/index.rst
index cf5ea6d8..1d64e064 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -2,9 +2,11 @@ checksit
========
checksit is a tool that provides auto-checking of files against a range of available checks.
-Originally designed to be used against the NCAS Data Standards to provide conformancy checking, it has evolved to be more generic and can accomodate other compliance checking options too.
+Originally designed to be used against the NCAS Data Standards to provide conformance checking,
+it has evolved to be more generic and can accommodate other compliance checking options too.
-On a basic level a user can point the checksit tool at a given file and it will attempt to run some basic checks based on some matches that it will try to perform.
+On a basic level a user can point the checksit tool at a given file and it will attempt to run
+some basic checks based on some matches that it will try to perform.
Other options include specifying the particular checks to run or to compare with known 'good' files.
.. toctree::
diff --git a/docs/source/install.rst b/docs/source/install.rst
index c620e25f..cb618c64 100644
--- a/docs/source/install.rst
+++ b/docs/source/install.rst
@@ -4,7 +4,8 @@ Installation
Source
------
-Download the latest and development versions directly from GitHub into your home directory, and change directory into the repository
+Download the latest and development versions directly from GitHub into your home directory,
+and change directory into the repository
.. code-block::
diff --git a/docs/source/specifics.rst b/docs/source/specifics.rst
index a1443793..828992ad 100644
--- a/docs/source/specifics.rst
+++ b/docs/source/specifics.rst
@@ -6,7 +6,11 @@ File specific actions
NCAS-GENERAL
------------
-Files that conform to the NCAS-GENERAL standard are recognised by ``checksit``\ , and specs referring to the correct version of the standard are automatically searched for and used by ``checksit``\ , with specs to include checking file name format, global attributes, dimensions and variables for the used deployment mode and data product. For example, for a file with data from an automatic weather station (\ ``ncas-aws-10``\ ) using version 2.0.0 of the standard,
+Files that are designed to follow the NCAS-GENERAL standard are recognised by ``checksit``\ , and specs
+referring to the correct version of the standard are automatically searched for and used by
+``checksit``\ , with specs that include checking file name format, global attributes, dimensions
+and variables for the used deployment mode and data product. For example, for a file with data
+from an automatic weather station (\ ``ncas-aws-10``\ ) using version 2.0.0 of the standard,
.. code-block::
@@ -21,6 +25,15 @@ is the same as
NCAS-IMAGE
----------
-Coming soon...
+The NCAS-IMAGE standard is also identified by ``checksit``\ , and the appropriate specs can be
+found to check both global tags and photo or plot specific tags, i.e.
+
+.. code-block::
+
+ checksit check ncas-cam-9_cao_20231117_photo_v1.0.jpg
+
+is the same as
+
+.. code-block::
+
+ checksit check -t off -s ncas-image-1.0.0/amof-image-global-attrs,ncas-image-1.0.0/amof-photo ncas-cam-9_cao_20231117_photo_v1.0.jpg
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 9851d5fb..7c44391a 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -4,7 +4,10 @@ Usage
Simplest check
--------------
-First, ``cd`` into the ``checksit`` repository and activate the environment ``checksit`` was installed into. **As default, checksit needs to be run from the top level of the checksit repository**. For installations that followed the directions on the installation page, that will look like
+First, ``cd`` into the ``checksit`` repository and activate the environment ``checksit`` was
+installed into. **By default, checksit needs to be run from the top level of the checksit
+repository**. For installations that followed the directions on the installation page, that
+will look like
.. code-block::
@@ -17,7 +20,8 @@
Then ``checksit`` can be run using the following, as an example:
checksit check /badc/ukcp18/data/land-cpm/uk/2.2km/rcp85/01/rss/day/latest/rss_rcp85_land-cpm_uk_2.2km_01_day_20671201-20681130.nc
-``checksit`` will then look at the file given and attempt to find either a template file to compare against or a series of specs to match with, and then print out the results of the checks.
+``checksit`` will then look at the file given and attempt to find either a template file to
+compare against or a series of specs to match with, and then print out the results of the checks.
Specify Template
----------------
@@ -28,7 +32,8 @@ A specific template can be chosen for ``checksit`` to use using the ``-t/--templ
checksit check --template=template-cache/rls_rcp85_land-cpm_uk_2.2km_01_day_19801201-19811130.cdl /badc/ukcp18/data/land-cpm/uk/2.2km/rcp85/01/rss/day/latest/rss_rcp85_land-cpm_uk_2.2km_01_day_20671201-20681130.nc
-If the file being checked is a file which you might want to check other files against, a template can be created when checking this file by using the ``--auto-cache`` flag, e.g.
+If the file being checked is a file which you might want to check other files against, a template
+can be created when checking this file by using the ``--auto-cache`` flag, e.g.
.. code-block::
@@ -37,13 +42,16 @@ If the file being checked is a file which you might want to check other files ag
Specify Specs
-------------
-A spec file, or number of spec files, can also be given to ``checksit`` to compare the file against, using the ``-s/--specs`` flag. These files, in YAML format, point to functions and define parameters that will be used to check the file with
+A spec file, or a number of spec files, can also be given to ``checksit`` to compare the file against,
+using the ``-s/--specs`` flag. These files, in YAML format, point to functions and define parameters
+that will be used to check the file with
.. code-block::
checksit check --specs=ceda-base /badc/ukcp18/data/land-cpm/uk/2.2km/rcp85/01/rss/day/latest/rss_rcp85_land-cpm_uk_2.2km_01_day_20671201-20681130.nc
-``checksit`` will still attempt to find a template, or use a given one, to check the file with. To only use specs, the template option can be switched off by specifying ``-t off``.
+``checksit`` will still attempt to find a template, or use a given one, to check the file with. To
+only use specs, the template option can be switched off by specifying ``-t off``.
Brief other flags
-----------------