From e7c9448b86b7d15c387748a35569f849394d7881 Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin
Date: Thu, 9 Jan 2025 01:32:47 +0000
Subject: [PATCH] ci/transformers: add baseline checks for test cases

Add a script to parse pytest JUnit XML reports and compare test results
against a baseline of known failures. The script prints a markdown
summary of new failures, known failures, new passes and flaky tests,
and returns a non-zero exit code when the baseline needs attention:
1 for new failures, 2 for new passes, 3 for skipped flaky tests.

Signed-off-by: Dmitry Rogozhkin
---
 .github/scripts/check-transformers.py     | 266 ++++++++++++++++++++++
 .github/workflows/_linux_transformers.yml |  24 +-
 2 files changed, 282 insertions(+), 8 deletions(-)
 create mode 100644 .github/scripts/check-transformers.py

diff --git a/.github/scripts/check-transformers.py b/.github/scripts/check-transformers.py
new file mode 100644
index 000000000..23061080c
--- /dev/null
+++ b/.github/scripts/check-transformers.py
@@ -0,0 +1,266 @@
+import argparse
+import sys
+
+from junitparser import JUnitXml, Error, Failure, Skipped
+
+parser = argparse.ArgumentParser()
+parser.add_argument('junitxml', nargs='+')
+args = parser.parse_args()
+
+failing_cases = {
+    'tests.models.dpt.test_modeling_dpt_auto_backbone.DPTModelTest': {
+        'test_batching_equivalence': { 'flaky': True },
+    },
+    'tests.benchmark.test_benchmark.BenchmarkTest': {
+        'test_inference_encoder_decoder_with_configs': None,
+        'test_inference_fp16': None,
+        'test_inference_no_configs': None,
+        'test_inference_no_configs_only_pretrain': None,
+        'test_inference_no_model_no_architectures': None,
+        'test_inference_torchscript': None,
+        'test_inference_with_configs': None,
+        'test_save_csv_files': None,
+        'test_trace_memory': None,
+        'test_train_encoder_decoder_with_configs': None,
+        'test_train_no_configs': None,
+        'test_train_no_configs_fp16': None,
+        'test_train_with_configs': None,
+    },
+    'tests.generation.test_logits_process.LogitsProcessorTest': {
+        'test_watermarking_processor': None,
+    },
+    'tests.generation.test_utils.GenerationIntegrationTests': {
+        'test_assisted_decoding_encoder_decoder_shared_encoder': None,
+        'test_assisted_decoding_num_assistant_tokens_heuristic_schedule': None,
+        'test_assisted_generation_early_exit': None,
+        'test_custom_logits_processor': None,
+        'test_default_max_length_warning': None,
+        'test_eos_token_id_int_and_list_beam_search': None,
+        'test_eos_token_id_int_and_list_top_k_top_sampling': None,
+        'test_generate_compile_fullgraph_tiny': None,
+        'test_generated_length_assisted_generation': None,
+        'test_max_new_tokens_encoder_decoder': None,
+        'test_min_length_if_input_embeds': None,
+        'test_model_kwarg_assisted_decoding_decoder_only': None,
+        'test_model_kwarg_assisted_decoding_encoder_decoder': None,
+        'test_model_kwarg_encoder_signature_filtering': None,
+        'test_prepare_inputs_for_generation_decoder_llm': None,
+        'test_stop_sequence_stopping_criteria': None,
+    },
+    'tests.models.fuyu.test_modeling_fuyu.FuyuModelTest': {
+        'test_prompt_lookup_decoding_matches_greedy_search': None,
+    },
+    'tests.models.git.test_modeling_git.GitModelTest': {
+        'test_generate_continue_from_past_key_values': None,
+        'test_inputs_embeds_matches_input_ids': None,
+    },
+    'tests.models.hiera.test_modeling_hiera.HieraModelTest': {
+        'test_torch_fx': None,
+        'test_torch_fx_output_loss': None,
+    },
+    'tests.models.mamba.test_modeling_mamba.MambaIntegrationTests': {
+        'test_simple_generate_1_cpu': None,
+    },
+    'tests.models.pix2struct.test_modeling_pix2struct.Pix2StructModelTest': {
+        'test_new_cache_format_0': None,
+        'test_new_cache_format_1': None,
+        'test_new_cache_format_2': None,
+    },
+    'tests.models.speecht5.test_modeling_speecht5.SpeechT5ForTextToSpeechIntegrationTests': {
+        'test_batch_generation': None,
+    },
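+    # Baseline entries map a test class to a dict of test names. A value of
+    # None marks a plain known failure; a properties dict such as
+    # { 'flaky': True } marks a test which fails intermittently and is
+    # reported separately from stable failures.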
+    'tests.pipelines.test_pipelines_automatic_speech_recognition.AutomaticSpeechRecognitionPipelineTests': {
+        'test_small_model_pt_seq2seq': None,
+    },
+    'tests.pipelines.test_pipelines_common.CustomPipelineTest': {
+        'test_custom_code_with_string_tokenizer': None,
+    },
+    'tests.pipelines.test_pipelines_depth_estimation.DepthEstimationPipelineTests': {
+        'test_multiprocess': None,
+    },
+    'tests.pipelines.test_pipelines_image_to_text.ImageToTextPipelineTests': {
+        'test_small_model_pt': None,
+    },
+    'tests.pipelines.test_pipelines_summarization.SummarizationPipelineTests': {
+        'test_small_model_pt': None,
+    },
+    'tests.pipelines.test_pipelines_text_generation.TextGenerationPipelineTests': {
+        'test_small_model_pt': None,
+        'test_stop_sequence_stopping_criteria': None,
+    },
+    'tests.pipelines.test_pipelines_video_classification.VideoClassificationPipelineTests': {
+        'test_small_model_pt': None,
+    },
+    'tests.pipelines.test_pipelines_visual_question_answering.VisualQuestionAnsweringPipelineTests': {
+        'test_small_model_pt_blip2': None,
+    },
+    'tests.pipelines.test_pipelines_zero_shot_image_classification.ZeroShotImageClassificationPipelineTests': {
+        'test_small_model_pt': None,
+        'test_small_model_pt_fp16': None,
+    },
+    'tests.test_pipeline_mixin.AutomaticSpeechRecognitionPipelineTests': {
+        'test_small_model_pt_seq2seq': None,
+    },
+    'tests.test_pipeline_mixin.DepthEstimationPipelineTests': {
+        'test_multiprocess': None,
+    },
+    'tests.test_pipeline_mixin.ImageToTextPipelineTests': {
+        'test_small_model_pt': None,
+    },
+    'tests.test_pipeline_mixin.SummarizationPipelineTests': {
+        'test_small_model_pt': None,
+    },
+    'tests.test_pipeline_mixin.TextGenerationPipelineTests': {
+        'test_small_model_pt': None,
+        'test_stop_sequence_stopping_criteria': None,
+    },
+    'tests.test_pipeline_mixin.VideoClassificationPipelineTests': {
+        'test_small_model_pt': None,
+    },
+    'tests.test_pipeline_mixin.VisualQuestionAnsweringPipelineTests': {
+        'test_small_model_pt_blip2': None,
+    },
+    'tests.test_pipeline_mixin.ZeroShotImageClassificationPipelineTests': {
+        'test_small_model_pt': None,
+        'test_small_model_pt_fp16': None,
+    },
+}
+
+new_failures = []
+known_failures = []
+new_passes = []
+flakies = []
+skipped_flakies = []
+
+def get_classname(case):
+    return ' '.join(case.classname.split())
+
+def get_name(case):
+    return ' '.join(case.name.split())
+
+def get_result(case):
+    result = "passed"
+    if case.result:
+        if isinstance(case.result[0], Error):
+            result = "error"
+        elif isinstance(case.result[0], Skipped):
+            result = "skipped"
+        elif isinstance(case.result[0], Failure):
+            result = "failed"
+    return result
+
+def get_message(case):
+    if not case.result:
+        return ""
+    return case.result[0].message.splitlines()[0]
+
+def is_known_failure(classname, name):
+    if classname in failing_cases and name in failing_cases[classname]:
+        return True
+    return False
+
+def is_flaky(classname, name):
+    if classname in failing_cases and name in failing_cases[classname]:
+        _case = failing_cases[classname][name]
+        if _case is None:
+            return False
+        return bool(_case.get('flaky', False))
+    return False
+
+xmls = [ JUnitXml.fromfile(f) for f in args.junitxml ]
+for xml in xmls:
+    for suite in xml:
+        for case in suite:
+            classname = get_classname(case)
+            name = get_name(case)
+            result = get_result(case)
+            if is_flaky(classname, name):
+                if result == "skipped":
+                    skipped_flakies.append(case)
+                else:
+                    flakies.append(case)
+            else:
+                if result not in ["passed", "skipped"]:
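+                    # A case which neither passed nor was skipped is either a
+                    # failure already tracked in the baseline or a new failure
+                    # which makes the script exit with code 1.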
"skipped"]: + if is_known_failure(classname, name): + known_failures.append(case) + else: + new_failures.append(case) + else: + if is_known_failure(classname, name): + new_passes.append(case) + +def print_md_row(row, print_header): + if print_header: + header = " | ".join([f"{key}" for key, _ in row.items()]) + print(f"| {header} |") + header = " | ".join(["-"*len(key) for key, _ in row.items()]) + print(f"| {header} |") + row = " | ".join([f"{value}" for _, value in row.items()]) + print(f"| {row} |") + +def print_cases(cases): + print_header = True + for case in cases: + classname = get_classname(case) + name = get_name(case) + result = get_result(case) + message = get_message(case) + row = { + 'Class name': classname, + 'Test name': name, + 'Status': result, + 'Message': message, + } + print_md_row(row, print_header) + print_header = False + +printed = False +def print_break(needed): + if needed: + print("") + +if new_failures: + print_break(printed) + print("### New failures") + print_cases(new_failures) + printed = True + +if known_failures: + print_break(printed) + print("### Known failures") + print_cases(known_failures) + printed = True + +if new_passes: + print_break(printed) + print("### New passes") + print_cases(new_passes) + print("") + print("> [NOTE]") + print("> Adjust baseline: some tests which previously failed now pass.") + printed = True + +if skipped_flickies: + print_break(printed) + print("### Skipped flickies") + print_cases(skipped_flickies) + print("") + print("> [NOTE]") + print("> Adjust baseline: some flicky tests are now skipped.") + printed = True + +if flickies: + print_break(printed) + print("### Flickies") + print_cases(flickies) + printed = True + +if new_failures: + sys.exit(1) +elif new_passes: + sys.exit(2) +elif skipped_flickies: + sys.exit(3) + +sys.exit(0) diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml index 602433322..790340730 100644 --- a/.github/workflows/_linux_transformers.yml +++ b/.github/workflows/_linux_transformers.yml @@ -87,6 +87,7 @@ jobs: conda remove --all -y -n huggingface_transformers_test || rm -rf $(dirname ${CONDA_EXE})/../envs/huggingface_transformers_test conda create -y -n huggingface_transformers_test python=${{ env.python }} source activate huggingface_transformers_test + pip install junitparser - name: Prepare Stock XPU Pytorch run: | pwd @@ -135,7 +136,7 @@ jobs: run: | source activate huggingface_transformers_test cd transformers - python3 -m pytest -rsf --make-reports=$TEST_CASE -k backbone tests || \ + python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml -k backbone tests || \ (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) - name: Run tests/*.py env: @@ -143,14 +144,14 @@ jobs: run: | source activate huggingface_transformers_test cd transformers - python3 -m pytest -rsf --make-reports=$TEST_CASE tests/*.py || true + python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/*.py || true - name: Run tests/benchmark env: TEST_CASE: 'tests_benchmark' run: | source activate huggingface_transformers_test cd transformers - python3 -m pytest -rsf --make-reports=$TEST_CASE tests/benchmark || true + python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/benchmark || true - name: Run tests/generation env: TEST_CASE: 'tests_generation' @@ -160,7 +161,7 @@ jobs: # Excluding tests due to: # * torch.distributed.* not yet supported by XPU pattern="not TestFSDPGeneration" - 
+          python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/generation -k "$pattern" || true
       - name: Run tests/models
         env:
           TEST_CASE: 'tests_models'
         run: |
@@ -175,7 +176,7 @@
             not test_model_parallel_equal_results and \
             not test_resize_embeddings_untied and \
             not test_resize_tokens_embeddings"
-          python3 -m pytest -rsf --make-reports=$TEST_CASE tests/models -k "$pattern" || true
+          python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/models -k "$pattern" || true
       - name: Run tests/pipelines
         env:
           TEST_CASE: 'tests_pipelines'
         run: |
           source activate huggingface_transformers_test
           cd transformers
           # Some tests are known to fail w/o clear pattern
           # TODO: drop ||true after triage and fixes
-          python3 -m pytest -rsf --make-reports=$TEST_CASE tests/pipelines || true
+          python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/pipelines || true
       - name: Run tests/trainer
         env:
           TEST_CASE: 'tests_trainer'
         run: |
@@ -199,7 +200,7 @@
             not TestTrainerDistributed and \
             not TestTrainerDistributedXPU and \
             not TestFSDPTrainer"
-          python3 -m pytest -rsf --make-reports=$TEST_CASE tests/trainer -k "$pattern" || \
+          python3 -m pytest -rsf --make-reports=$TEST_CASE tests/trainer --junit-xml=reports/$TEST_CASE.xml -k "$pattern" || \
             (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV)
       - name: Run tests/utils
         env:
@@ -210,13 +211,15 @@
         run: |
           source activate huggingface_transformers_test
           cd transformers
           # Excluding tests due to:
           # * Network proxy connection issue, reason unknown
           pattern="not test_load_img_url_timeout"
-          python3 -m pytest -rsf --make-reports=$TEST_CASE tests/utils -k "$pattern" || \
+          python3 -m pytest -rsf --make-reports=$TEST_CASE tests/utils --junit-xml=reports/$TEST_CASE.xml -k "$pattern" || \
            (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV)
       - name: Check for errors in tests
         run: |
           FAILED_CASES=$(echo $FAILED_CASES | sed 's/^,//')
           echo "Failed cases: [$(echo $FAILED_CASES | sed 's/,/, /g')]"
           test -z "$FAILED_CASES"
+          source activate huggingface_transformers_test
+          python3 torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml
       - name: Clean HF home directory and cache
         if: ${{ always() }}
         run: |
@@ -251,6 +254,11 @@
             echo "| $test_group | $errors | $failed | $deselected | $passed | $skipped |"
           done
         } >> $GITHUB_STEP_SUMMARY
+      - name: Print baseline difference
+        if: ${{ ! cancelled() }}
+        run: |
+          source activate huggingface_transformers_test
+          python3 torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml >> $GITHUB_STEP_SUMMARY || true
       - name: Print failure lines
         if: ${{ ! cancelled() }}
         run: |