From e475b9d38da42b65f1502a8b146047acfea34642 Mon Sep 17 00:00:00 2001 From: BJ Hargrave Date: Mon, 1 Jul 2024 17:53:07 -0400 Subject: [PATCH] WIP: Use instructlab-schema package to parse qna.yaml files We remove code duplicated with instructlab to use the shared code from instructlab-schema package. See https://github.com/instructlab/schema/pull/33 Signed-off-by: BJ Hargrave --- .pylintrc | 3 +- requirements.txt | 1 + src/instructlab/sdg/utils/taxonomy.py | 248 +++----------------------- 3 files changed, 28 insertions(+), 224 deletions(-) diff --git a/.pylintrc b/.pylintrc index 3b4da7a8..9a5bd9d8 100644 --- a/.pylintrc +++ b/.pylintrc @@ -379,7 +379,8 @@ int-import-graph= known-standard-library= # Force import order to recognize a module as part of a third party library. -known-third-party=enchant +known-third-party=enchant, + instructlab.schema, # Couples of modules and preferred modules, separated by a comma. preferred-modules= diff --git a/requirements.txt b/requirements.txt index 24c3ab8f..09033303 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 click>=8.1.7,<9.0.0 httpx>=0.25.0,<1.0.0 +instructlab-schema @ git+https://github.com/instructlab/schema.git@refs/pull/33/head jinja2 langchain-text-splitters openai>=1.13.3,<2.0.0 diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index d6f6441b..099379b1 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -1,44 +1,30 @@ # SPDX-License-Identifier: Apache-2.0 # Standard -from functools import cache from pathlib import Path -from typing import Any, Dict, List, Mapping, Optional, Union +from typing import Dict, List, Optional, Union import glob -import json import logging import os import re -import subprocess import tempfile # Third Party +from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS +from instructlab.schema.taxonomy import ( + TaxonomyMessageFormat, + TaxonomyParser, + TaxonomyReadingException, +) import git import gitdb import yaml -# First Party -from instructlab.sdg import utils -from instructlab.sdg.utils import chunking +# Local +from . import GenerateException, chunking logger = logging.getLogger(__name__) -DEFAULT_YAML_RULES = """\ -extends: relaxed - -rules: - line-length: - max: 120 -""" - - -class TaxonomyReadingException(Exception): - """An exception raised during reading of the taxonomy.""" - - -TAXONOMY_FOLDERS: List[str] = ["compositional_skills", "knowledge"] -"""Taxonomy folders which are also the schema names""" - def _istaxonomyfile(fn): path = Path(fn) @@ -133,207 +119,24 @@ def _get_documents( raise e -@cache -def _load_schema(path: "importlib.resources.abc.Traversable") -> "referencing.Resource": - """Load the schema from the path into a Resource object. - - Args: - path (Traversable): Path to the schema to be loaded. - - Raises: - NoSuchResource: If the resource cannot be loaded. - - Returns: - Resource: A Resource containing the requested schema. - """ - # pylint: disable=C0415 - # Third Party - from referencing import Resource - from referencing.exceptions import NoSuchResource - from referencing.jsonschema import DRAFT202012 - - try: - contents = json.loads(path.read_text(encoding="utf-8")) - resource = Resource.from_contents( - contents=contents, default_specification=DRAFT202012 - ) - except Exception as e: - raise NoSuchResource(ref=str(path)) from e - return resource - - -def _validate_yaml(contents: Mapping[str, Any], taxonomy_path: Path) -> int: - """Validate the parsed yaml document using the taxonomy path to - determine the proper schema. - - Args: - contents (Mapping): The parsed yaml document to validate against the schema. - taxonomy_path (Path): Relative path of the taxonomy yaml document where the - first element is the schema to use. - - Returns: - int: The number of errors found during validation. - Messages for each error have been logged. - """ - # pylint: disable=C0415 - # Standard - from importlib import resources - - # Third Party - from jsonschema.protocols import Validator - from jsonschema.validators import validator_for - from referencing import Registry, Resource - from referencing.exceptions import NoSuchResource - from referencing.typing import URI - - errors = 0 - version = _get_version(contents) - schemas_path = resources.files("instructlab.schema").joinpath(f"v{version}") - - def retrieve(uri: URI) -> Resource: - path = schemas_path.joinpath(uri) - return _load_schema(path) - - schema_name = taxonomy_path.parts[0] - if schema_name not in TAXONOMY_FOLDERS: - schema_name = "knowledge" if "document" in contents else "compositional_skills" - logger.info( - f"Cannot determine schema name from path {taxonomy_path}. Using {schema_name} schema." - ) - - try: - schema_resource = retrieve(f"{schema_name}.json") - schema = schema_resource.contents - validator_cls = validator_for(schema) - validator: Validator = validator_cls( - schema, registry=Registry(retrieve=retrieve) - ) - - for validation_error in validator.iter_errors(contents): - errors += 1 - yaml_path = validation_error.json_path[1:] - if not yaml_path: - yaml_path = "." - if validation_error.validator == "minItems": - # Special handling for minItems which can have a long message for seed_examples - message = ( - f"Value must have at least {validation_error.validator_value} items" - ) - else: - message = validation_error.message[-200:] - logger.error( - f"Validation error in {taxonomy_path}: [{yaml_path}] {message}" - ) - except NoSuchResource as e: - cause = e.__cause__ if e.__cause__ is not None else e - errors += 1 - logger.error(f"Cannot load schema file {e.ref}. {cause}") - - return errors - - -def _get_version(contents: Mapping) -> int: - version = contents.get("version", 1) - if not isinstance(version, int): - # schema validation will complain about the type - try: - version = int(version) - except ValueError: - version = 1 # fallback to version 1 - return version - - # pylint: disable=broad-exception-caught def _read_taxonomy_file(file_path: str, yaml_rules: Optional[str] = None): + parser = TaxonomyParser( + schema_version=0, # Use version value in yaml + message_format=TaxonomyMessageFormat.LOGGING, # Report warnings and errors to the logger + yamllint_config=yaml_rules, + yamllint_strict=True, # Report yamllint warnings as errors + ) + taxonomy = parser.parse(file_path) + + if taxonomy.errors or taxonomy.warnings: + return None, taxonomy.warnings, taxonomy.errors + seed_instruction_data = [] - warnings = 0 - errors = 0 - file_path = Path(file_path).resolve() - # file should end with ".yaml" explicitly - if file_path.suffix != ".yaml": - logger.warning( - f"Skipping {file_path}! Use lowercase '.yaml' extension instead." - ) - warnings += 1 - return None, warnings, errors - for i in range(len(file_path.parts) - 1, -1, -1): - if file_path.parts[i] in TAXONOMY_FOLDERS: - taxonomy_path = Path(*file_path.parts[i:]) - break - else: - taxonomy_path = file_path - # read file if extension is correct try: - with open(file_path, "r", encoding="utf-8") as file: - contents = yaml.safe_load(file) - if not contents: - logger.warning(f"Skipping {file_path} because it is empty!") - warnings += 1 - return None, warnings, errors - if not isinstance(contents, Mapping): - logger.error( - f"{file_path} is not valid. The top-level element is not an object with key-value pairs." - ) - errors += 1 - return None, warnings, errors - - # do general YAML linting if specified - version = _get_version(contents) - if version > 1: # no linting for version 1 yaml - if yaml_rules is not None: - is_file = os.path.isfile(yaml_rules) - if is_file: - logger.debug(f"Using YAML rules from {yaml_rules}") - yamllint_cmd = [ - "yamllint", - "-f", - "parsable", - "-c", - yaml_rules, - file_path, - "-s", - ] - else: - logger.debug(f"Cannot find {yaml_rules}. Using default rules.") - yamllint_cmd = [ - "yamllint", - "-f", - "parsable", - "-d", - DEFAULT_YAML_RULES, - file_path, - "-s", - ] - else: - yamllint_cmd = [ - "yamllint", - "-f", - "parsable", - "-d", - DEFAULT_YAML_RULES, - file_path, - "-s", - ] - try: - subprocess.check_output(yamllint_cmd, text=True) - except subprocess.SubprocessError as e: - lint_messages = [f"Problems found in file {file_path}"] - parsed_output = e.output.splitlines() - for p in parsed_output: - errors += 1 - delim = str(file_path) + ":" - parsed_p = p.split(delim)[1] - lint_messages.append(parsed_p) - logger.error("\n".join(lint_messages)) - return None, warnings, errors - - validation_errors = _validate_yaml(contents, taxonomy_path) - if validation_errors: - errors += validation_errors - return None, warnings, errors - # get seed instruction data - tax_path = "->".join(taxonomy_path.parent.parts) + tax_path = "->".join(taxonomy.path.parent.parts) + contents = taxonomy.parsed task_description = contents.get("task_description") domain = contents.get("domain") documents = contents.get("document") @@ -341,7 +144,7 @@ def _read_taxonomy_file(file_path: str, yaml_rules: Optional[str] = None): documents = _get_documents(source=documents) logger.debug("Content from git repo fetched") - for seed_example in contents.get("seed_examples"): + for seed_example in contents.get("seed_examples", []): question = seed_example.get("question") answer = seed_example.get("answer") context = seed_example.get("context", "") @@ -357,10 +160,9 @@ def _read_taxonomy_file(file_path: str, yaml_rules: Optional[str] = None): } ) except Exception as e: - errors += 1 raise TaxonomyReadingException(f"Exception {e} raised in {file_path}") from e - return seed_instruction_data, warnings, errors + return seed_instruction_data, 0, 0 def read_taxonomy(taxonomy, taxonomy_base, yaml_rules): @@ -440,7 +242,7 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count samples[-1].setdefault("domain", domain) samples[-1].setdefault("document", chunk) if samples[-1].get("document") and not samples[-1].get("domain"): - raise utils.GenerateException( + raise GenerateException( "Error: No domain provided for knowledge document in leaf node" ) if "icl_query_3" in samples[-1]: