Merge branch 'main' into atta_q_safety
elronbandel authored Jan 19, 2025
2 parents c288c3b + 9842108 commit c5651ab
Showing 523 changed files with 15,174 additions and 482 deletions.
10 changes: 9 additions & 1 deletion docs/docs/evaluating_datasets.rst
@@ -107,4 +107,12 @@ Will print:
| templates.key_val | 1 | 0.222222 | f1_micro | 0.0 | 0.7225818346056374 | 7 |
| templates.classification.multi_class.relation.default | 3 | 0.285714 | f1_micro | 0.0 | 0.779447856172277 | 6 |
| templates.classification.multi_class.relation.default | 0 | 0.181818 | f1_micro | 0.0 | 0.4105379478071894 | 19 |
| templates.key_val | 0 | 0 | f1_micro | | | 7 |
Metadata
--------
The result object returned by the `evaluate` function contains a `metadata` feature.
This feature contains the dataset metadata and the inference engine metadata (if they exist).

This metadata can be accessed and used for further analysis or debugging.
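For example, a minimal sketch continuing the snippet above (assuming the metadata is exposed as a `metadata` attribute on the result object; its exact contents depend on the dataset and the inference engine used):

.. code-block:: python

    results = evaluate(predictions=predictions, data=dataset)

    # Dataset and inference engine metadata (if available)
    print(results.metadata)
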
14 changes: 14 additions & 0 deletions docs/docs/loading_datasets.rst
@@ -160,3 +160,17 @@ Here is an example of using random templates and a varying number of demonstrations:
.. code-block:: python

    dataset = load_dataset(card="cards.wnli", template=["templates.classification.multi_class.relation.default", "templates.key_val"], num_demos=[0,1,3], demos_pool_size=100)
Metadata
--------
The result of the `load_dataset` function contains a metadata object. If the result is a Dataset or IterableDataset, the metadata
is saved under the path `info.description`. If the result is a dict of datasets, each dataset contains the metadata at the same path.
The metadata is a dictionary that contains information about the execution, including:

* All parameters passed to the `load_dataset` function
* Execution time
* Other relevant metadata

This metadata can be accessed and used for further analysis or debugging.
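For example, a minimal sketch (assuming a single split is requested, so a single Dataset is returned):

.. code-block:: python

    from unitxt import load_dataset

    dataset = load_dataset(card="cards.wnli", template="templates.key_val", split="test")

    # For a Dataset or IterableDataset, the metadata is stored under info.description
    print(dataset.info.description)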

2 changes: 1 addition & 1 deletion examples/evaluate_rag_using_binary_llm_as_judge.py
@@ -64,7 +64,7 @@
# all available models for this judge are under "catalog.engines.classification"
mixtral_engine = "engines.classification.mixtral_8x7b_instruct_v01_wml"
correctness_judge_metric_mixtral = (
f"{metric_name}[{mapping_override}, model={mixtral_engine}]"
f"{metric_name}[{mapping_override}, inference_model={mixtral_engine}]"
)

metrics = [correctness_judge_metric_llama, correctness_judge_metric_mixtral]
42 changes: 42 additions & 0 deletions examples/evaluate_vision_benchmark.py
@@ -0,0 +1,42 @@
from unitxt import evaluate, load_dataset, settings
from unitxt.inference import (
CrossProviderInferenceEngine,
)

with settings.context(
disable_hf_datasets_cache=False,
allow_unverified_code=True,
):
test_dataset = load_dataset(
"benchmarks.vision[format=formats.chat_api,loader_limit=30,max_samples_per_subset=30]",
split="test",
)

# Infer
model = CrossProviderInferenceEngine(
model="llama-3-2-11b-vision-instruct", max_tokens=30, provider="watsonx"
)
"""
We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as:
watsonx, bam, openai, azure, aws and more.
For the arguments these inference engines can receive, please refer to the classes documentation or read
about the the open ai api arguments the CrossProviderInferenceEngine follows.
"""

predictions = model(test_dataset)
results = evaluate(predictions=predictions, data=test_dataset)

print("Global scores:")
print(results.global_scores.summary)
print("Subsets scores:")
print(results.subsets_scores.summary)

# | subset | score | score_name | num_of_instances |
# |:---------|---------:|:----------------|-------------------:|
# | ALL | 0.429583 | subsets_mean | 150 |
# | doc_vqa | 0.79103 | anls | 30 |
# | info_vqa | 0.464885 | anls | 30 |
# | chart_qa | 0.3 | relaxed_overall | 30 |
# | ai2d | 0.2 | exact_match_mm | 30 |
# | websrc | 0.392 | websrc_squad_f1 | 30 |
25 changes: 25 additions & 0 deletions prepare/augmentors/table_augmentors.py
@@ -3,7 +3,12 @@
DuplicateTableColumns,
DuplicateTableRows,
InsertEmptyTableRows,
MaskColumnsNames,
ShuffleColumnsNames,
ShuffleTableColumns,
ShuffleTableRows,
TransposeTable,
TruncateTableRows,
)

operator = TransposeTable()
@@ -21,3 +26,23 @@
operator = InsertEmptyTableRows()

add_to_catalog(operator, "augmentors.table.insert_empty_rows", overwrite=True)

operator = ShuffleTableRows()

add_to_catalog(operator, "augmentors.table.shuffle_rows", overwrite=True)

operator = ShuffleTableColumns()

add_to_catalog(operator, "augmentors.table.shuffle_cols", overwrite=True)

operator = TruncateTableRows()

add_to_catalog(operator, "augmentors.table.truncate_rows", overwrite=True)

operator = MaskColumnsNames()

add_to_catalog(operator, "augmentors.table.mask_cols_names", overwrite=True)

operator = ShuffleColumnsNames()

add_to_catalog(operator, "augmentors.table.shuffle_cols_names", overwrite=True)
56 changes: 56 additions & 0 deletions prepare/benchmarks/tables_benchmark.py
@@ -0,0 +1,56 @@
import os
from collections import OrderedDict

from unitxt.benchmark import Benchmark
from unitxt.catalog import add_to_catalog
from unitxt.settings_utils import get_constants

constants = get_constants()

tables_benchmark_dir = os.path.join(
constants.catalog_dir,
"recipes",
"tables_benchmark",
)


# Recursive function to build nested benchmarks
def build_nested_benchmark(dir_path, prefix="recipes.tables_benchmark"):
nested_scenarios = OrderedDict()

for entry in sorted(os.listdir(dir_path)):
entry_path = os.path.join(dir_path, entry)
entry_name = os.fsdecode(entry)

if os.path.isdir(entry_path): # Handle subdirectories
# Recurse into subdirectory to create a nested benchmark
sub_benchmark = build_nested_benchmark(entry_path, f"{prefix}.{entry_name}")
nested_scenarios[entry_name] = sub_benchmark
else: # Handle individual JSON files
scenario_name = (
entry_name[: -len(".json")]
if entry_name.endswith(".json")
else entry_name
)
nested_scenarios[scenario_name] = f"{prefix}.{scenario_name}"

# Create a Benchmark object for the current folder
return Benchmark(nested_scenarios)


# Build the top-level benchmark
tables_benchmark_scenarios = build_nested_benchmark(tables_benchmark_dir)

benchmark = Benchmark(
tables_benchmark_scenarios.subsets,
__description__=(
"TablesBenchmark is an open-source benchmark developed by domain experts to evaluate various table-related tasks and capabilities.\n\n"
".. image:: https://raw.githubusercontent.com/IBM/unitxt/main/assets/catalog/tables_benchmark.png\n"
" :alt: Optional alt text\n"
" :width: 30%\n"
" :align: center\n\n"
"Constructed using state-of-the-art benchmarking methodologies, TablesBenchmark ensures validity, robustness, and efficiency by utilizing unitxt's dynamic and flexible text processing abilities.\n\n"
"It encompasses diverse domains and evaluates a range of capabilities, with additional tasks and domains integrated over time."
),
)
add_to_catalog(benchmark, "benchmarks.tables_benchmark", overwrite=True)
25 changes: 25 additions & 0 deletions prepare/benchmarks/vision.py
@@ -0,0 +1,25 @@
from unitxt.benchmark import Benchmark
from unitxt.catalog import add_to_catalog
from unitxt.standard import DatasetRecipe

benchmark = Benchmark(
subsets={
"doc_vqa": DatasetRecipe(
card="cards.doc_vqa.lmms_eval",
),
"info_vqa": DatasetRecipe(
card="cards.info_vqa_lmms_eval",
),
"chart_qa": DatasetRecipe(
card="cards.chart_qa_lmms_eval",
),
"ai2d": DatasetRecipe(
card="cards.ai2d",
),
"websrc": DatasetRecipe(
card="cards.websrc",
),
},
)

add_to_catalog(benchmark, "benchmarks.vision", overwrite=True)
20 changes: 12 additions & 8 deletions prepare/cards/ai2d.py
@@ -1,10 +1,19 @@
from unitxt import get_from_catalog
from unitxt.blocks import LoadHF, Set, TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.image_operators import ToImage
from unitxt.operators import Cast, Rename
from unitxt.templates import MultipleChoiceTemplate
from unitxt.test_utils.card import test_card

templates = get_from_catalog("templates.qa.multiple_choice.with_context.no_intro.all")
template = MultipleChoiceTemplate(
input_format="{context}\n{question}\n{choices}\nAnswer with the option's letter from the given choices directly.",
choices_separator="\n",
target_field="answer",
enumerator="capitals",
)

card = TaskCard(
loader=LoadHF(path="lmms-lab/ai2d"),
preprocess_steps=[
@@ -14,18 +23,13 @@
Cast(field="answer", to="int"),
],
task="tasks.qa.multiple_choice.with_context[metrics=[metrics.exact_match_mm]]",
templates="templates.qa.multiple_choice.with_context.no_intro.all",
default_template=MultipleChoiceTemplate(
input_format="{context}\n{question}\n{choices}\nAnswer with the option's letter from the given choices directly.",
choices_separator="\n",
target_field="answer",
enumerator="capitals",
),
templates=[template, *templates.items],
default_template=template,
__tags__={},
__description__=(
"AI2 Diagrams (AI2D) is a dataset of over 5000 grade school science diagrams with over 150000 rich annotations, their ground truth syntactic parses, and more than 15000 corresponding multiple choice questions."
),
)

test_card(card)
test_card(card, strict=False)
add_to_catalog(card, "cards.ai2d", overwrite=True)
19 changes: 12 additions & 7 deletions prepare/cards/chart_qa.py
@@ -1,3 +1,4 @@
from unitxt import get_from_catalog
from unitxt.blocks import LoadHF, Set, TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.collections_operators import Wrap
@@ -7,6 +8,13 @@
from unitxt.templates import MultiReferenceTemplate
from unitxt.test_utils.card import test_card

templates = get_from_catalog("templates.qa.with_context.all")
template = MultiReferenceTemplate(
input_format="{context}\n{question}\nAnswer the question using a single word.",
references_field="answers",
__description__="lmms-evals default template for chartqa.",
)

card = TaskCard(
loader=LoadHF(path="HuggingFaceM4/ChartQA"),
preprocess_steps=[
Expand All @@ -17,7 +25,8 @@
Set(fields={"context_type": "image"}),
],
task="tasks.qa.with_context",
templates="templates.qa.with_context.all",
templates=[template, *templates.items],
default_template=template,
__tags__={
"license": "GPL-3.0",
"multilinguality": "monolingual",
@@ -43,12 +52,8 @@
Set(fields={"context_type": "image"}),
],
task="tasks.qa.with_context.with_type[metrics=[metrics.relaxed_correctness]]",
templates="templates.qa.with_context.all",
default_template=MultiReferenceTemplate(
input_format="{context}\n{question}\nAnswer the question using a single word.",
references_field="answers",
__description__="lmms-evals default template for chartqa.",
),
templates=[template, *templates.items],
default_template=template,
__tags__={
"license": "GPL-3.0",
"multilinguality": "monolingual",