Merge branch 'main' into atta_q_safety
elronbandel authored Jan 19, 2025
2 parents c288c3b + 9842108 commit c5651ab
Showing 523 changed files with 15,174 additions and 482 deletions.
10 changes: 9 additions & 1 deletion docs/docs/evaluating_datasets.rst
@@ -107,4 +107,12 @@ Will print:
| templates.key_val | 1 | 0.222222 | f1_micro | 0.0 | 0.7225818346056374 | 7 |
| templates.classification.multi_class.relation.default | 3 | 0.285714 | f1_micro | 0.0 | 0.779447856172277 | 6 |
| templates.classification.multi_class.relation.default | 0 | 0.181818 | f1_micro | 0.0 | 0.4105379478071894 | 19 |
| templates.key_val | 0 | 0 | f1_micro | | | 7 |
Metadata
--------
The result object returned by the `evaluate` function contains a `metadata` feature.
This feature contains the dataset metadata and the inference engine metadata (if they exist).

This metadata can be accessed and used for further analysis or debugging.
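For example, a minimal sketch continuing the snippet above (assuming the metadata is exposed as a `metadata` attribute on the result object; its exact contents depend on the dataset and the inference engine used):

.. code-block:: python

    results = evaluate(predictions=predictions, data=dataset)

    # Dataset and inference engine metadata (if available)
    print(results.metadata)
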
14 changes: 14 additions & 0 deletions docs/docs/loading_datasets.rst
@@ -160,3 +160,17 @@ Here is an example of using random templates and a varying number of demonstrations:
.. code-block:: python

    dataset = load_dataset(card="cards.wnli", template=["templates.classification.multi_class.relation.default", "templates.key_val"], num_demos=[0,1,3], demos_pool_size=100)
Metadata
--------
The result of the `load_dataset` function contains a metadata object. If the result is a Dataset or IterableDataset, the metadata
is saved under the path `info.description`. If the result is a dict of datasets, each dataset contains the metadata at the same path.
The metadata is a dictionary that contains information about the execution, including:

* All parameters passed to the `load_dataset` function
* Execution time
* Other relevant metadata

This metadata can be accessed and used for further analysis or debugging.
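For example, a minimal sketch (assuming a single split is requested, so a single Dataset is returned):

.. code-block:: python

    from unitxt import load_dataset

    dataset = load_dataset(card="cards.wnli", template="templates.key_val", split="test")

    # For a Dataset or IterableDataset, the metadata is stored under info.description
    print(dataset.info.description)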

2 changes: 1 addition & 1 deletion examples/evaluate_rag_using_binary_llm_as_judge.py
@@ -64,7 +64,7 @@
# all available models for this judge are under "catalog.engines.classification"
mixtral_engine = "engines.classification.mixtral_8x7b_instruct_v01_wml"
correctness_judge_metric_mixtral = (
f"{metric_name}[{mapping_override}, model={mixtral_engine}]"
f"{metric_name}[{mapping_override}, inference_model={mixtral_engine}]"
)

metrics = [correctness_judge_metric_llama, correctness_judge_metric_mixtral]
42 changes: 42 additions & 0 deletions examples/evaluate_vision_benchmark.py
@@ -0,0 +1,42 @@
from unitxt import evaluate, load_dataset, settings
from unitxt.inference import (
CrossProviderInferenceEngine,
)

with settings.context(
disable_hf_datasets_cache=False,
allow_unverified_code=True,
):
test_dataset = load_dataset(
"benchmarks.vision[format=formats.chat_api,loader_limit=30,max_samples_per_subset=30]",
split="test",
)

# Infer
model = CrossProviderInferenceEngine(
model="llama-3-2-11b-vision-instruct", max_tokens=30, provider="watsonx"
)
"""
We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as:
watsonx, bam, openai, azure, aws and more.
For the arguments these inference engines can receive, please refer to the classes documentation or read
about the the open ai api arguments the CrossProviderInferenceEngine follows.
"""

predictions = model(test_dataset)
results = evaluate(predictions=predictions, data=test_dataset)

print("Global scores:")
print(results.global_scores.summary)
print("Subsets scores:")
print(results.subsets_scores.summary)

# | subset | score | score_name | num_of_instances |
# |:---------|---------:|:----------------|-------------------:|
# | ALL | 0.429583 | subsets_mean | 150 |
# | doc_vqa | 0.79103 | anls | 30 |
# | info_vqa | 0.464885 | anls | 30 |
# | chart_qa | 0.3 | relaxed_overall | 30 |
# | ai2d | 0.2 | exact_match_mm | 30 |
# | websrc | 0.392 | websrc_squad_f1 | 30 |
25 changes: 25 additions & 0 deletions prepare/augmentors/table_augmentors.py
@@ -3,7 +3,12 @@
DuplicateTableColumns,
DuplicateTableRows,
InsertEmptyTableRows,
MaskColumnsNames,
ShuffleColumnsNames,
ShuffleTableColumns,
ShuffleTableRows,
TransposeTable,
TruncateTableRows,
)

operator = TransposeTable()
@@ -21,3 +26,23 @@
operator = InsertEmptyTableRows()

add_to_catalog(operator, "augmentors.table.insert_empty_rows", overwrite=True)

operator = ShuffleTableRows()

add_to_catalog(operator, "augmentors.table.shuffle_rows", overwrite=True)

operator = ShuffleTableColumns()

add_to_catalog(operator, "augmentors.table.shuffle_cols", overwrite=True)

operator = TruncateTableRows()

add_to_catalog(operator, "augmentors.table.truncate_rows", overwrite=True)

operator = MaskColumnsNames()

add_to_catalog(operator, "augmentors.table.mask_cols_names", overwrite=True)

operator = ShuffleColumnsNames()

add_to_catalog(operator, "augmentors.table.shuffle_cols_names", overwrite=True)
56 changes: 56 additions & 0 deletions prepare/benchmarks/tables_benchmark.py
@@ -0,0 +1,56 @@
import os
from collections import OrderedDict

from unitxt.benchmark import Benchmark
from unitxt.catalog import add_to_catalog
from unitxt.settings_utils import get_constants

constants = get_constants()

tables_benchmark_dir = os.path.join(
constants.catalog_dir,
"recipes",
"tables_benchmark",
)


# Recursive function to build nested benchmarks
def build_nested_benchmark(dir_path, prefix="recipes.tables_benchmark"):
nested_scenarios = OrderedDict()

for entry in sorted(os.listdir(dir_path)):
entry_path = os.path.join(dir_path, entry)
entry_name = os.fsdecode(entry)

if os.path.isdir(entry_path): # Handle subdirectories
# Recurse into subdirectory to create a nested benchmark
sub_benchmark = build_nested_benchmark(entry_path, f"{prefix}.{entry_name}")
nested_scenarios[entry_name] = sub_benchmark
else: # Handle individual JSON files
scenario_name = (
entry_name[: -len(".json")]
if entry_name.endswith(".json")
else entry_name
)
nested_scenarios[scenario_name] = f"{prefix}.{scenario_name}"

# Create a Benchmark object for the current folder
return Benchmark(nested_scenarios)


# Build the top-level benchmark
tables_benchmark_scenarios = build_nested_benchmark(tables_benchmark_dir)

benchmark = Benchmark(
tables_benchmark_scenarios.subsets,
__description__=(
"TablesBenchmark is an open-source benchmark developed by domain experts to evaluate various table-related tasks and capabilities.\n\n"
".. image:: https://raw.githubusercontent.com/IBM/unitxt/main/assets/catalog/tables_benchmark.png\n"
" :alt: Optional alt text\n"
" :width: 30%\n"
" :align: center\n\n"
"Constructed using state-of-the-art benchmarking methodologies, TablesBenchmark ensures validity, robustness, and efficiency by utilizing unitxt's dynamic and flexible text processing abilities.\n\n"
"It encompasses diverse domains and evaluates a range of capabilities, with additional tasks and domains integrated over time."
),
)
add_to_catalog(benchmark, "benchmarks.tables_benchmark", overwrite=True)
25 changes: 25 additions & 0 deletions prepare/benchmarks/vision.py
@@ -0,0 +1,25 @@
from unitxt.benchmark import Benchmark
from unitxt.catalog import add_to_catalog
from unitxt.standard import DatasetRecipe

benchmark = Benchmark(
subsets={
"doc_vqa": DatasetRecipe(
card="cards.doc_vqa.lmms_eval",
),
"info_vqa": DatasetRecipe(
card="cards.info_vqa_lmms_eval",
),
"chart_qa": DatasetRecipe(
card="cards.chart_qa_lmms_eval",
),
"ai2d": DatasetRecipe(
card="cards.ai2d",
),
"websrc": DatasetRecipe(
card="cards.websrc",
),
},
)

add_to_catalog(benchmark, "benchmarks.vision", overwrite=True)
20 changes: 12 additions & 8 deletions prepare/cards/ai2d.py
@@ -1,10 +1,19 @@
from unitxt import get_from_catalog
from unitxt.blocks import LoadHF, Set, TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.image_operators import ToImage
from unitxt.operators import Cast, Rename
from unitxt.templates import MultipleChoiceTemplate
from unitxt.test_utils.card import test_card

templates = get_from_catalog("templates.qa.multiple_choice.with_context.no_intro.all")
template = MultipleChoiceTemplate(
input_format="{context}\n{question}\n{choices}\nAnswer with the option's letter from the given choices directly.",
choices_separator="\n",
target_field="answer",
enumerator="capitals",
)

card = TaskCard(
loader=LoadHF(path="lmms-lab/ai2d"),
preprocess_steps=[
@@ -14,18 +23,13 @@
Cast(field="answer", to="int"),
],
task="tasks.qa.multiple_choice.with_context[metrics=[metrics.exact_match_mm]]",
templates="templates.qa.multiple_choice.with_context.no_intro.all",
default_template=MultipleChoiceTemplate(
input_format="{context}\n{question}\n{choices}\nAnswer with the option's letter from the given choices directly.",
choices_separator="\n",
target_field="answer",
enumerator="capitals",
),
templates=[template, *templates.items],
default_template=template,
__tags__={},
__description__=(
"AI2 Diagrams (AI2D) is a dataset of over 5000 grade school science diagrams with over 150000 rich annotations, their ground truth syntactic parses, and more than 15000 corresponding multiple choice questions."
),
)

test_card(card)
test_card(card, strict=False)
add_to_catalog(card, "cards.ai2d", overwrite=True)
19 changes: 12 additions & 7 deletions prepare/cards/chart_qa.py
@@ -1,3 +1,4 @@
from unitxt import get_from_catalog
from unitxt.blocks import LoadHF, Set, TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.collections_operators import Wrap
@@ -7,6 +8,13 @@
from unitxt.templates import MultiReferenceTemplate
from unitxt.test_utils.card import test_card

templates = get_from_catalog("templates.qa.with_context.all")
template = MultiReferenceTemplate(
input_format="{context}\n{question}\nAnswer the question using a single word.",
references_field="answers",
__description__="lmms-evals default template for chartqa.",
)

card = TaskCard(
loader=LoadHF(path="HuggingFaceM4/ChartQA"),
preprocess_steps=[
Expand All @@ -17,7 +25,8 @@
Set(fields={"context_type": "image"}),
],
task="tasks.qa.with_context",
templates="templates.qa.with_context.all",
templates=[template, *templates.items],
default_template=template,
__tags__={
"license": "GPL-3.0",
"multilinguality": "monolingual",
@@ -43,12 +52,8 @@
Set(fields={"context_type": "image"}),
],
task="tasks.qa.with_context.with_type[metrics=[metrics.relaxed_correctness]]",
templates="templates.qa.with_context.all",
default_template=MultiReferenceTemplate(
input_format="{context}\n{question}\nAnswer the question using a single word.",
references_field="answers",
__description__="lmms-evals default template for chartqa.",
),
templates=[template, *templates.items],
default_template=template,
__tags__={
"license": "GPL-3.0",
"multilinguality": "monolingual",