From e7c9448b86b7d15c387748a35569f849394d7881 Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin
Date: Thu, 9 Jan 2025 01:32:47 +0000
Subject: [PATCH] ci/transformers: add baseline checks for test cases

Add a script to parse pytest JUnit XML reports and compare test results
against a baseline of known failures. The script prints a markdown
summary of new failures, known failures, new passes and flaky tests,
and returns a non-zero exit code when the baseline needs attention:
1 for new failures, 2 for new passes, 3 for skipped flaky tests.

Signed-off-by: Dmitry Rogozhkin
---
 .github/scripts/check-transformers.py     | 266 ++++++++++++++++++++++
 .github/workflows/_linux_transformers.yml |  24 +-
 2 files changed, 282 insertions(+), 8 deletions(-)
 create mode 100644 .github/scripts/check-transformers.py

diff --git a/.github/scripts/check-transformers.py b/.github/scripts/check-transformers.py
new file mode 100644
index 000000000..23061080c
--- /dev/null
+++ b/.github/scripts/check-transformers.py
@@ -0,0 +1,266 @@
+import argparse
+import sys
+
+from junitparser import JUnitXml, Error, Failure, Skipped
+
+parser = argparse.ArgumentParser()
+parser.add_argument('junitxml', nargs='+')
+args = parser.parse_args()
+
+failing_cases = {
+    'tests.models.dpt.test_modeling_dpt_auto_backbone.DPTModelTest': {
+        'test_batching_equivalence': { 'flaky': True },
+    },
+    'tests.benchmark.test_benchmark.BenchmarkTest': {
+        'test_inference_encoder_decoder_with_configs': None,
+        'test_inference_fp16': None,
+        'test_inference_no_configs': None,
+        'test_inference_no_configs_only_pretrain': None,
+        'test_inference_no_model_no_architectures': None,
+        'test_inference_torchscript': None,
+        'test_inference_with_configs': None,
+        'test_save_csv_files': None,
+        'test_trace_memory': None,
+        'test_train_encoder_decoder_with_configs': None,
+        'test_train_no_configs': None,
+        'test_train_no_configs_fp16': None,
+        'test_train_with_configs': None,
+    },
+    'tests.generation.test_logits_process.LogitsProcessorTest': {
+        'test_watermarking_processor': None,
+    },
+    'tests.generation.test_utils.GenerationIntegrationTests': {
+        'test_assisted_decoding_encoder_decoder_shared_encoder': None,
+        'test_assisted_decoding_num_assistant_tokens_heuristic_schedule': None,
+        'test_assisted_generation_early_exit': None,
+        'test_custom_logits_processor': None,
+        'test_default_max_length_warning': None,
+        'test_eos_token_id_int_and_list_beam_search': None,
+        'test_eos_token_id_int_and_list_top_k_top_sampling': None,
+        'test_generate_compile_fullgraph_tiny': None,
+        'test_generated_length_assisted_generation': None,
+        'test_max_new_tokens_encoder_decoder': None,
+        'test_min_length_if_input_embeds': None,
+        'test_model_kwarg_assisted_decoding_decoder_only': None,
+        'test_model_kwarg_assisted_decoding_encoder_decoder': None,
+        'test_model_kwarg_encoder_signature_filtering': None,
+        'test_prepare_inputs_for_generation_decoder_llm': None,
+        'test_stop_sequence_stopping_criteria': None,
+    },
+    'tests.models.fuyu.test_modeling_fuyu.FuyuModelTest': {
+        'test_prompt_lookup_decoding_matches_greedy_search': None,
+    },
+    'tests.models.git.test_modeling_git.GitModelTest': {
+        'test_generate_continue_from_past_key_values': None,
+        'test_inputs_embeds_matches_input_ids': None,
+    },
+    'tests.models.hiera.test_modeling_hiera.HieraModelTest': {
+        'test_torch_fx': None,
+        'test_torch_fx_output_loss': None,
+    },
+    'tests.models.mamba.test_modeling_mamba.MambaIntegrationTests': {
+        'test_simple_generate_1_cpu': None,
+    },
+    'tests.models.pix2struct.test_modeling_pix2struct.Pix2StructModelTest': {
+        'test_new_cache_format_0': None,
+        'test_new_cache_format_1': None,
+        'test_new_cache_format_2': None,
+    },
+    'tests.models.speecht5.test_modeling_speecht5.SpeechT5ForTextToSpeechIntegrationTests': {
+        'test_batch_generation': None,
+    },
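+    # Baseline entries map a test class to a dict of test names. A value of
+    # None marks a plain known failure; a properties dict such as
+    # { 'flaky': True } marks a test which fails intermittently and is
+    # reported separately from stable failures.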
+    'tests.pipelines.test_pipelines_automatic_speech_recognition.AutomaticSpeechRecognitionPipelineTests': {
+        'test_small_model_pt_seq2seq': None,
+    },
+    'tests.pipelines.test_pipelines_common.CustomPipelineTest': {
+        'test_custom_code_with_string_tokenizer': None,
+    },
+    'tests.pipelines.test_pipelines_depth_estimation.DepthEstimationPipelineTests': {
+        'test_multiprocess': None,
+    },
+    'tests.pipelines.test_pipelines_image_to_text.ImageToTextPipelineTests': {
+        'test_small_model_pt': None,
+    },
+    'tests.pipelines.test_pipelines_summarization.SummarizationPipelineTests': {
+        'test_small_model_pt': None,
+    },
+    'tests.pipelines.test_pipelines_text_generation.TextGenerationPipelineTests': {
+        'test_small_model_pt': None,
+        'test_stop_sequence_stopping_criteria': None,
+    },
+    'tests.pipelines.test_pipelines_video_classification.VideoClassificationPipelineTests': {
+        'test_small_model_pt': None,
+    },
+    'tests.pipelines.test_pipelines_visual_question_answering.VisualQuestionAnsweringPipelineTests': {
+        'test_small_model_pt_blip2': None,
+    },
+    'tests.pipelines.test_pipelines_zero_shot_image_classification.ZeroShotImageClassificationPipelineTests': {
+        'test_small_model_pt': None,
+        'test_small_model_pt_fp16': None,
+    },
+    'tests.test_pipeline_mixin.AutomaticSpeechRecognitionPipelineTests': {
+        'test_small_model_pt_seq2seq': None,
+    },
+    'tests.test_pipeline_mixin.DepthEstimationPipelineTests': {
+        'test_multiprocess': None,
+    },
+    'tests.test_pipeline_mixin.ImageToTextPipelineTests': {
+        'test_small_model_pt': None,
+    },
+    'tests.test_pipeline_mixin.SummarizationPipelineTests': {
+        'test_small_model_pt': None,
+    },
+    'tests.test_pipeline_mixin.TextGenerationPipelineTests': {
+        'test_small_model_pt': None,
+        'test_stop_sequence_stopping_criteria': None,
+    },
+    'tests.test_pipeline_mixin.VideoClassificationPipelineTests': {
+        'test_small_model_pt': None,
+    },
+    'tests.test_pipeline_mixin.VisualQuestionAnsweringPipelineTests': {
+        'test_small_model_pt_blip2': None,
+    },
+    'tests.test_pipeline_mixin.ZeroShotImageClassificationPipelineTests': {
+        'test_small_model_pt': None,
+        'test_small_model_pt_fp16': None,
+    },
+}
+
+new_failures = []
+known_failures = []
+new_passes = []
+flakies = []
+skipped_flakies = []
+
+def get_classname(case):
+    return ' '.join(case.classname.split())
+
+def get_name(case):
+    return ' '.join(case.name.split())
+
+def get_result(case):
+    result = "passed"
+    if case.result:
+        if isinstance(case.result[0], Error):
+            result = "error"
+        elif isinstance(case.result[0], Skipped):
+            result = "skipped"
+        elif isinstance(case.result[0], Failure):
+            result = "failed"
+    return result
+
+def get_message(case):
+    if not case.result:
+        return ""
+    return case.result[0].message.splitlines()[0]
+
+def is_known_failure(classname, name):
+    if classname in failing_cases and name in failing_cases[classname]:
+        return True
+    return False
+
+def is_flaky(classname, name):
+    if classname in failing_cases and name in failing_cases[classname]:
+        _case = failing_cases[classname][name]
+        if _case is None:
+            return False
+        return bool(_case.get('flaky', False))
+    return False
+
+xmls = [ JUnitXml.fromfile(f) for f in args.junitxml ]
+for xml in xmls:
+    for suite in xml:
+        for case in suite:
+            classname = get_classname(case)
+            name = get_name(case)
+            result = get_result(case)
+            if is_flaky(classname, name):
+                if result == "skipped":
+                    skipped_flakies.append(case)
+                else:
+                    flakies.append(case)
+            else:
+                if result not in ["passed", "skipped"]:
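+                    # A case which neither passed nor was skipped is either a
+                    # failure already tracked in the baseline or a new failure
+                    # which makes the script exit with code 1.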
"skipped"]: + if is_known_failure(classname, name): + known_failures.append(case) + else: + new_failures.append(case) + else: + if is_known_failure(classname, name): + new_passes.append(case) + +def print_md_row(row, print_header): + if print_header: + header = " | ".join([f"{key}" for key, _ in row.items()]) + print(f"| {header} |") + header = " | ".join(["-"*len(key) for key, _ in row.items()]) + print(f"| {header} |") + row = " | ".join([f"{value}" for _, value in row.items()]) + print(f"| {row} |") + +def print_cases(cases): + print_header = True + for case in cases: + classname = get_classname(case) + name = get_name(case) + result = get_result(case) + message = get_message(case) + row = { + 'Class name': classname, + 'Test name': name, + 'Status': result, + 'Message': message, + } + print_md_row(row, print_header) + print_header = False + +printed = False +def print_break(needed): + if needed: + print("") + +if new_failures: + print_break(printed) + print("### New failures") + print_cases(new_failures) + printed = True + +if known_failures: + print_break(printed) + print("### Known failures") + print_cases(known_failures) + printed = True + +if new_passes: + print_break(printed) + print("### New passes") + print_cases(new_passes) + print("") + print("> [NOTE]") + print("> Adjust baseline: some tests which previously failed now pass.") + printed = True + +if skipped_flickies: + print_break(printed) + print("### Skipped flickies") + print_cases(skipped_flickies) + print("") + print("> [NOTE]") + print("> Adjust baseline: some flicky tests are now skipped.") + printed = True + +if flickies: + print_break(printed) + print("### Flickies") + print_cases(flickies) + printed = True + +if new_failures: + sys.exit(1) +elif new_passes: + sys.exit(2) +elif skipped_flickies: + sys.exit(3) + +sys.exit(0) diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml index 602433322..790340730 100644 --- a/.github/workflows/_linux_transformers.yml +++ b/.github/workflows/_linux_transformers.yml @@ -87,6 +87,7 @@ jobs: conda remove --all -y -n huggingface_transformers_test || rm -rf $(dirname ${CONDA_EXE})/../envs/huggingface_transformers_test conda create -y -n huggingface_transformers_test python=${{ env.python }} source activate huggingface_transformers_test + pip install junitparser - name: Prepare Stock XPU Pytorch run: | pwd @@ -135,7 +136,7 @@ jobs: run: | source activate huggingface_transformers_test cd transformers - python3 -m pytest -rsf --make-reports=$TEST_CASE -k backbone tests || \ + python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml -k backbone tests || \ (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) - name: Run tests/*.py env: @@ -143,14 +144,14 @@ jobs: run: | source activate huggingface_transformers_test cd transformers - python3 -m pytest -rsf --make-reports=$TEST_CASE tests/*.py || true + python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/*.py || true - name: Run tests/benchmark env: TEST_CASE: 'tests_benchmark' run: | source activate huggingface_transformers_test cd transformers - python3 -m pytest -rsf --make-reports=$TEST_CASE tests/benchmark || true + python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/benchmark || true - name: Run tests/generation env: TEST_CASE: 'tests_generation' @@ -160,7 +161,7 @@ jobs: # Excluding tests due to: # * torch.distributed.* not yet supported by XPU pattern="not TestFSDPGeneration" - 
+          python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/generation -k "$pattern" || true
       - name: Run tests/models
         env:
           TEST_CASE: 'tests_models'
         run: |
@@ -175,7 +176,7 @@
             not test_model_parallel_equal_results and \
             not test_resize_embeddings_untied and \
             not test_resize_tokens_embeddings"
-          python3 -m pytest -rsf --make-reports=$TEST_CASE tests/models -k "$pattern" || true
+          python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/models -k "$pattern" || true
       - name: Run tests/pipelines
         env:
           TEST_CASE: 'tests_pipelines'
         run: |
           source activate huggingface_transformers_test
           cd transformers
           # Some tests are known to fail w/o clear pattern
           # TODO: drop ||true after triage and fixes
-          python3 -m pytest -rsf --make-reports=$TEST_CASE tests/pipelines || true
+          python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/pipelines || true
       - name: Run tests/trainer
         env:
           TEST_CASE: 'tests_trainer'
         run: |
@@ -199,7 +200,7 @@
             not TestTrainerDistributed and \
             not TestTrainerDistributedXPU and \
             not TestFSDPTrainer"
-          python3 -m pytest -rsf --make-reports=$TEST_CASE tests/trainer -k "$pattern" || \
+          python3 -m pytest -rsf --make-reports=$TEST_CASE tests/trainer --junit-xml=reports/$TEST_CASE.xml -k "$pattern" || \
             (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV)
       - name: Run tests/utils
         env:
@@ -210,13 +211,15 @@
         run: |
           source activate huggingface_transformers_test
           cd transformers
           # Excluding tests due to:
           # * Network proxy connection issue, reason unknown
           pattern="not test_load_img_url_timeout"
-          python3 -m pytest -rsf --make-reports=$TEST_CASE tests/utils -k "$pattern" || \
+          python3 -m pytest -rsf --make-reports=$TEST_CASE tests/utils --junit-xml=reports/$TEST_CASE.xml -k "$pattern" || \
            (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV)
       - name: Check for errors in tests
         run: |
           FAILED_CASES=$(echo $FAILED_CASES | sed 's/^,//')
           echo "Failed cases: [$(echo $FAILED_CASES | sed 's/,/, /g')]"
           test -z "$FAILED_CASES"
+          source activate huggingface_transformers_test
+          python3 torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml
       - name: Clean HF home directory and cache
         if: ${{ always() }}
         run: |
@@ -251,6 +254,11 @@
             echo "| $test_group | $errors | $failed | $deselected | $passed | $skipped |"
           done
         } >> $GITHUB_STEP_SUMMARY
+      - name: Print baseline difference
+        if: ${{ ! cancelled() }}
+        run: |
+          source activate huggingface_transformers_test
+          python3 torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml >> $GITHUB_STEP_SUMMARY || true
       - name: Print failure lines
         if: ${{ ! cancelled() }}
         run: |