Fp8 implementation #1100

Open
wants to merge 10 commits into base: main
9 changes: 4 additions & 5 deletions docs/source/openvino/export.mdx
@@ -31,7 +31,7 @@ Check out the help for more options:

```text
usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
[--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8}]
[--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2}]
[--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
[--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
[--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
@@ -67,10 +67,9 @@ Optional arguments:
on your local machine arbitrary code present in the model repository.
--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
The weight format of the exported model.
--quant-mode {int8}
--quant-mode {int8,f8e4m3,f8e5m2}
Quantization precision mode. This is used for applying full model quantization including
activations. The only currently supported choice is 'int8' for int8 quantization of both
weights and activations.
activations.
--library {transformers,diffusers,timm,sentence_transformers,open_clip}
The library used to load the model before export. If not provided, will attempt to infer the
local checkpoint's library
@@ -166,7 +165,7 @@ Models larger than 1 billion parameters are exported to the OpenVINO format with
</Tip>


Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to `int8`. This will quantize both weights and activations of Linear, Convolutional and some other layers to int8. Currently this is only supported for speech-to-text models. Please see example below.
Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to the preferred precision. This will quantize both weights and activations of Linear, Convolutional and some other layers to the selected precision. Please see the example below.

```bash
optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode int8 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo
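# Hypothetical fp8 variant of the same command (illustrative output directory; assumes the f8e4m3/f8e5m2 modes added in this PR):
optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode f8e4m3 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo-fp8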
6 changes: 1 addition & 5 deletions optimum/commands/export/openvino.py
@@ -78,11 +78,10 @@ def parse_args_openvino(parser: "ArgumentParser"):
optional_group.add_argument(
"--quant-mode",
type=str,
choices=["int8"],
choices=["int8", "f8e4m3", "f8e5m2"],
default=None,
help=(
"Quantization precision mode. This is used for applying full model quantization including activations. "
"The only currently supported choice is 'int8' for int8 quantization of both weights and activations."
),
)
optional_group.add_argument(
@@ -365,9 +364,6 @@ def run(self):
quantization_config["trust_remote_code"] = self.args.trust_remote_code
ov_config = OVConfig(quantization_config=quantization_config)
else:
if self.args.quant_mode != "int8":
raise ValueError("Only 'int8' quantization mode is currently supported.")

quantization_config = {
"weight_format": self.args.quant_mode,
"activation_format": self.args.quant_mode,
31 changes: 11 additions & 20 deletions optimum/intel/openvino/configuration.py
@@ -26,7 +26,7 @@
from optimum.configuration_utils import BaseConfig

from ..utils.import_utils import is_nncf_available
from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_SPEECH_TO_TEXT_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS


if is_nncf_available():
@@ -638,9 +638,9 @@ def __init__(
SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and
reduces quantization error.
weight_format (`str`, defaults to "int8"):
Data format weights are quantized to. Possible values: ['int8'].
Data format weights are quantized to. Possible values: ['int8', 'f8e4m3', 'f8e5m2'].
activation_format (`str`, defaults to "int8"):
Data format activations are compressed to. Possible values: ['int8'].
Data format activations are compressed to. Possible values: ['int8', 'f8e4m3', 'f8e5m2'].
"""
super().__init__(
bits=bits,
@@ -658,6 +658,13 @@
self.overflow_fix = overflow_fix
self.smooth_quant_alpha = smooth_quant_alpha
self.activation_format = activation_format

f8_formats = ["f8e4m3", "f8e5m2"]
if self.activation_format in f8_formats and self.weight_format in f8_formats:
logger.info(
f"{self.activation_format} for activations and {self.weight_format} weights were found. A symmetrical scheme will be used."
)
self.sym = True
self.post_init()

def post_init(self):
@@ -669,24 +676,11 @@
if self.bits != 8:
raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}")

if self.dataset is not None:
if self.dataset not in PREDEFINED_SPEECH_TO_TEXT_DATASETS:
raise ValueError(
f"You have entered the following string value for dataset: {self.dataset}. But it is not supported."
f" Currently you can only choose {list(PREDEFINED_SPEECH_TO_TEXT_DATASETS.keys())}."
)

if self.smooth_quant_alpha is not None and not (0 <= self.smooth_quant_alpha <= 1):
raise ValueError(
f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}"
)

if self.weight_format != "int8":
raise ValueError("Only 'int8' weight format is currently supported.")

if self.activation_format != "int8":
raise ValueError("Only 'int8' activation format is currently supported.")


class OVConfig(BaseConfig):
CONFIG_NAME = "openvino_config.json"
@@ -711,10 +705,7 @@ def __init__(
"compression", None
) # A field for backward-compatability of training-time compression parameters
if self.quantization_config is not None:
if isinstance(self.quantization_config, OVWeightQuantizationConfig):
self.dtype = self.quantization_config.weight_format
else:
self.dtype = "int8"
self.dtype = self.quantization_config.weight_format
else:
self.dtype = dtype
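
For reference, a minimal sketch of how the new fp8 modes could be driven from Python via the configuration classes touched above (a sketch only: the import path, dataset name, sample count, and alpha value are illustrative assumptions, not part of this diff):

```python
from optimum.intel import OVConfig, OVQuantizationConfig

# Full quantization with fp8 (e4m3) for both weights and activations.
# With matching fp8 formats, the config switches to a symmetric scheme (see above).
q_config = OVQuantizationConfig(
    dataset="librispeech",       # illustrative calibration dataset
    num_samples=32,              # illustrative number of calibration samples
    smooth_quant_alpha=0.9,      # illustrative SmoothQuant alpha
    weight_format="f8e4m3",
    activation_format="f8e4m3",
)

# OVConfig.dtype now mirrors the weight format ("f8e4m3"), per the change above.
ov_config = OVConfig(quantization_config=q_config)
```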

13 changes: 8 additions & 5 deletions optimum/intel/openvino/quantization.py
@@ -458,11 +458,6 @@ def _quantize_ovbasemodel(
if calibration_dataset is None:
raise ValueError("Calibration dataset is required to run quantization.")

if quantization_config.weight_format != "int8":
raise ValueError("Only 'int8' weight format is currently supported.")
if quantization_config.activation_format != "int8":
raise ValueError("Only 'int8' activation format is currently supported.")

# Quantize model(s)
if isinstance(self.model, _OVModelForWhisper):
self._quantize_whisper_model(quantization_config, calibration_dataset, **kwargs)
@@ -1071,6 +1066,14 @@ def _full_quantization(
matmul=quantization_config.smooth_quant_alpha
)

q_mode_map = {
"f8e4m3": nncf.QuantizationMode.FP8_E4M3,
"f8e5m2": nncf.QuantizationMode.FP8_E5M2,
}

if quantization_config.activation_format in q_mode_map:
kwargs.update({"mode": q_mode_map[quantization_config.activation_format]})

quantized_model = nncf.quantize(
model,
calibration_dataset,
30 changes: 20 additions & 10 deletions tests/openvino/test_exporters_cli.py
@@ -114,10 +114,19 @@ class OVCLIExportTestCase(unittest.TestCase):
(
"automatic-speech-recognition",
"whisper",
"--quant-mode int8 --dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
"int8",
"--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
(14, 22, 21) if is_transformers_version("<=", "4.36.0") else (14, 22, 25),
(14, 21, 17) if is_transformers_version("<=", "4.36.0") else (14, 22, 18),
),
(
"text-generation",
"llama",
"f8e4m3",
Comment on lines +124 to +125

Collaborator: Do I understand correctly that applying quantization to language models is the intended use case for fp8 quantization?

Contributor Author: I don't know what the purpose of the fp8 usage is; the ticket mentions at least LLMs and diffusers.

"--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
(13,),
(16,),
),
]

TEST_4BIT_CONFIGURATIONS = [
@@ -407,26 +416,27 @@ def test_exporters_cli_full_quantization(
self,
task: str,
model_type: str,
quant_mode: str,
option: str,
expected_num_fq_nodes_per_model: Tuple[int],
expected_num_f_nodes_per_model: Tuple[int],
expected_num_weight_nodes_per_model: Tuple[int],
):
with TemporaryDirectory() as tmpdir:
subprocess.run(
f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} {option} {tmpdir}",
f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --quant-mode {quant_mode} {option} {tmpdir}",
shell=True,
check=True,
)
model = eval(_HEAD_TO_AUTOMODELS[task]).from_pretrained(tmpdir)

submodels = []
models = [model]
if task == "automatic-speech-recognition":
submodels = [model.encoder, model.decoder, model.decoder_with_past]
self.assertEqual(len(expected_num_fq_nodes_per_model), len(submodels))
for i, model in enumerate(submodels):
actual_num_fq_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
self.assertEqual(expected_num_fq_nodes_per_model[i], actual_num_fq_nodes)
self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes["int8"])
models = [model.encoder, model.decoder, model.decoder_with_past]
self.assertEqual(len(expected_num_f_nodes_per_model), len(models))
for i, model in enumerate(models):
actual_num_f_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
self.assertEqual(expected_num_f_nodes_per_model[i], actual_num_f_nodes)
self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes[quant_mode])

def test_exporters_cli_int4_with_local_model_and_default_config(self):
with TemporaryDirectory() as tmpdir:
38 changes: 19 additions & 19 deletions tests/openvino/utils_tests.py
@@ -202,31 +202,31 @@


def get_num_quantized_nodes(model):
num_fake_quantize = 0
num_weight_nodes = {
"int8": 0,
"int4": 0,
"f4e2m1": 0,
"f8e8m0": 0,
"nf4": 0,
num_fake_nodes = 0
types_map = {
"i8": "int8",
Comment on lines +206 to +207
Collaborator: 👍
"u8": "int8",
"i4": "int4",
"u4": "int4",
"f4e2m1": "f4e2m1",
"f8e8m0": "f8e8m0",
"nf4": "nf4",
"f8e4m3": "f8e4m3",
"f8e5m2": "f8e5m2",
}
num_weight_nodes = {n: 0 for n in types_map.values()}
ov_model = model if isinstance(model, ov.Model) else model.model
for elem in ov_model.get_ops():
if "FakeQuantize" in elem.name:
num_fake_quantize += 1
num_fake_nodes += 1
if "FakeConvert" in elem.name:
num_fake_nodes += 1
for i in range(elem.get_output_size()):
type_name = elem.get_output_element_type(i).get_type_name()
if type_name in ["i8", "u8"]:
num_weight_nodes["int8"] += 1
if type_name in ["i4", "u4"]:
num_weight_nodes["int4"] += 1
if type_name == "f4e2m1":
num_weight_nodes["f4e2m1"] += 1
if type_name == "f8e8m0":
num_weight_nodes["f8e8m0"] += 1
if type_name == "nf4":
num_weight_nodes["nf4"] += 1
return num_fake_quantize, num_weight_nodes
if type_name in types_map:
name = types_map[type_name]
num_weight_nodes[name] += 1
return num_fake_nodes, num_weight_nodes


@contextmanager