diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index d582c5f7..c0d4dc1f 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -173,12 +173,14 @@ def _get_documents(
                         with open(file_path, "r", encoding="utf-8") as file:
                             content = file.read()
                             if _string_contains_html(content):
-                                raise ValueError(
-                                    f"Provided markdown file {file_path} contains"
-                                    " HTML, which is currently unsupported. Please"
-                                    " format your markdown documents without the"
-                                    " use of HTML or use a different document"
-                                    " filetype."
+                                # HTML in markdown no longer aborts: warn and still keep the document.
+                                logging.warning(
+                                    "Provided markdown file %s contains HTML contents, which is"
+                                    " currently unsupported as a part of markdown. NOTE: Continuing"
+                                    " this might affect your data generation quality. To get best"
+                                    " results please format your markdown documents without the"
+                                    " use of HTML or use a different document filetype.",
+                                    file_path,
                                 )
                             file_contents.append(content)
                             filepaths.append(Path(file_path))