diff --git a/docs/source/inference.mdx b/docs/source/inference.mdx index c0360322ea..0060dfabbf 100644 --- a/docs/source/inference.mdx +++ b/docs/source/inference.mdx @@ -34,6 +34,8 @@ outputs = cls_pipe("He's a dreadful magician.") [{'label': 'NEGATIVE', 'score': 0.9919503927230835}] ``` +See the [reference documentation](reference_ov) for more information about parameters, and examples for different tasks. + To easily save the resulting model, you can use the `save_pretrained()` method, which will save both the BIN and XML files describing the graph. It is useful to save the tokenizer to the same directory, to enable easy loading of the tokenizer for the model. diff --git a/docs/source/reference_ov.mdx b/docs/source/reference_ov.mdx index 8bc111d594..4c5ede653e 100644 --- a/docs/source/reference_ov.mdx +++ b/docs/source/reference_ov.mdx @@ -36,11 +36,21 @@ limitations under the License. [[autodoc]] openvino.modeling.OVModelForTokenClassification - ## OVModelForAudioClassification [[autodoc]] openvino.modeling.OVModelForAudioClassification +## OVModelForAudioFrameClassification + +[[autodoc]] openvino.modeling.OVModelForAudioFrameClassification + +## OVModelForCTC + +[[autodoc]] openvino.modeling.OVModelForCTC + +## OVModelForAudioXVector + +[[autodoc]] openvino.modeling.OVModelForAudioXVector ## OVModelForImageClassification diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 65e39d365f..8e3f7619a2 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -87,6 +87,9 @@ _import_structure["openvino"].extend( [ "OVModelForAudioClassification", + "OVModelForAudioFrameClassification", + "OVModelForAudioXVector", + "OVModelForCTC", "OVModelForCausalLM", "OVModelForFeatureExtraction", "OVModelForImageClassification", @@ -176,7 +179,10 @@ else: from .openvino import ( OVModelForAudioClassification, + OVModelForAudioFrameClassification, + OVModelForAudioXVector, OVModelForCausalLM, + OVModelForCTC, OVModelForFeatureExtraction, OVModelForImageClassification, OVModelForMaskedLM, diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index cfbac71fd1..fd0806cbcc 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -35,6 +35,9 @@ from .modeling import ( OVModelForAudioClassification, + OVModelForAudioFrameClassification, + OVModelForAudioXVector, + OVModelForCTC, OVModelForFeatureExtraction, OVModelForImageClassification, OVModelForMaskedLM, diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index b999a4116e..eb1f7a410f 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -23,6 +23,9 @@ AutoConfig, AutoModel, AutoModelForAudioClassification, + AutoModelForAudioFrameClassification, + AutoModelForAudioXVector, + AutoModelForCTC, AutoModelForImageClassification, AutoModelForMaskedLM, AutoModelForQuestionAnswering, @@ -32,13 +35,17 @@ from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.modeling_outputs import ( BaseModelOutput, + CausalLMOutput, ImageClassifierOutput, MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, + XVectorOutput, ) +from optimum.exporters import TasksManager + from .modeling_base import OVBaseModel @@ -93,6 +100,13 @@ Pixel values can be obtained from encoded images using [`AutoFeatureExtractor`](https://huggingface.co/docs/transformers/autoclass_tutorial#autofeatureextractor). 
""" +AUDIO_INPUTS_DOCSTRING = r""" + Args: + input_values (`torch.Tensor` of shape `({0})`): + Float values of input raw speech waveform.. + Input values can be obtained from audio file loaded into an array using [`AutoFeatureExtractor`](https://huggingface.co/docs/transformers/autoclass_tutorial#autofeatureextractor). +""" + class OVModel(OVBaseModel): base_model_prefix = "openvino_model" @@ -575,3 +589,234 @@ def forward( outputs = self.request(inputs) logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] return SequenceClassifierOutput(logits=logits) + + +CTC_EXAMPLE = r""" + Example of CTC: + + ```python + >>> from transformers import {processor_class} + >>> from optimum.intel import {model_class} + >>> from datasets import load_dataset + + >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") + >>> dataset = dataset.sort("id") + >>> sampling_rate = dataset.features["audio"].sampling_rate + + >>> processor = {processor_class}.from_pretrained("{checkpoint}") + >>> model = {model_class}.from_pretrained("{checkpoint}", export=True) + + >>> # audio file is decoded on the fly + >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="np") + >>> logits = model(**inputs).logits + >>> predicted_ids = np.argmax(logits, axis=-1) + + >>> transcription = processor.batch_decode(predicted_ids) + ``` +""" + + +@add_start_docstrings( + """ + Onnx Model with a language modeling head on top for Connectionist Temporal Classification (CTC). + """, + MODEL_START_DOCSTRING, +) +class OVModelForCTC(OVModel): + """ + CTC model for OpenVINO. + """ + + auto_model_class = AutoModelForCTC + export_feature = TasksManager.infer_task_from_model(auto_model_class) + + @add_start_docstrings_to_model_forward( + AUDIO_INPUTS_DOCSTRING.format("batch_size, sequence_length") + + CTC_EXAMPLE.format( + processor_class=_FEATURE_EXTRACTOR_FOR_DOC, + model_class="OVModelForCTC", + checkpoint="facebook/hubert-large-ls960-ft", + ) + ) + def forward( + self, + input_values: Optional[torch.Tensor] = None, + attention_mask: Optional[Union[torch.Tensor, np.ndarray]] = None, + **kwargs, + ): + np_inputs = isinstance(input_values, np.ndarray) + if not np_inputs: + input_values = np.array(input_values) + attention_mask = np.array(attention_mask) if attention_mask is not None else attention_mask + + inputs = { + "input_values": input_values, + } + + # Add the attention_mask when needed + if "attention_mask" in self.input_names: + inputs["attention_mask"] = attention_mask + + # Run inference + outputs = self.request(inputs) + logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + return CausalLMOutput(logits=logits) + + +AUDIO_XVECTOR_EXAMPLE = r""" + Example of Audio XVector: + + ```python + >>> from transformers import {processor_class} + >>> from optimum.intel import {model_class} + >>> from datasets import load_dataset + >>> import torch + + >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") + >>> dataset = dataset.sort("id") + >>> sampling_rate = dataset.features["audio"].sampling_rate + + >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}") + >>> model = {model_class}.from_pretrained("{checkpoint}", export=True) + + >>> # audio file is decoded on the fly + >>> inputs = feature_extractor( + ... 
[d["array"] for d in dataset[:2]["audio"]], sampling_rate=sampling_rate, return_tensors="pt", padding=True + ... ) + >>> embeddings = model(**inputs).embeddings + + >>> embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu() + + >>> cosine_sim = torch.nn.CosineSimilarity(dim=-1) + >>> similarity = cosine_sim(embeddings[0], embeddings[1]) + >>> threshold = 0.7 + >>> if similarity < threshold: + ... print("Speakers are not the same!") + >>> round(similarity.item(), 2) + ``` +""" + + +@add_start_docstrings( + """ + Onnx Model with an XVector feature extraction head on top for tasks like Speaker Verification. + """, + MODEL_START_DOCSTRING, +) +class OVModelForAudioXVector(OVModel): + """ + Audio XVector model for OpenVINO. + """ + + auto_model_class = AutoModelForAudioXVector + export_feature = TasksManager.infer_task_from_model(auto_model_class) + + @add_start_docstrings_to_model_forward( + AUDIO_INPUTS_DOCSTRING.format("batch_size, sequence_length") + + AUDIO_XVECTOR_EXAMPLE.format( + processor_class=_FEATURE_EXTRACTOR_FOR_DOC, + model_class="OVModelForAudioXVector", + checkpoint="anton-l/wav2vec2-base-superb-sv", + ) + ) + def forward( + self, + input_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + **kwargs, + ): + np_inputs = isinstance(input_values, np.ndarray) + if not np_inputs: + input_values = np.array(input_values) + attention_mask = np.array(attention_mask) if attention_mask is not None else attention_mask + + inputs = { + "input_values": input_values, + } + + # Add the attention_mask when needed + if "attention_mask" in self.input_names: + inputs["attention_mask"] = attention_mask + + # Run inference + outputs = self.request(inputs) + logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + embeddings = ( + torch.from_numpy(outputs["embeddings"]).to(self.device) if not np_inputs else outputs["embeddings"] + ) + + return XVectorOutput(logits=logits, embeddings=embeddings) + + +AUDIO_FRAME_CLASSIFICATION_EXAMPLE = r""" + Example of audio frame classification: + + ```python + >>> from transformers import {processor_class} + >>> from optimum.intel import {model_class} + >>> from datasets import load_dataset + >>> import torch + + >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") + >>> dataset = dataset.sort("id") + >>> sampling_rate = dataset.features["audio"].sampling_rate + + >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}") + >>> model = {model_class}.from_pretrained("{checkpoint}", export=True) + + >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt", sampling_rate=sampling_rate) + >>> logits = model(**inputs).logits + + >>> probabilities = torch.sigmoid(torch.as_tensor(logits)[0]) + >>> labels = (probabilities > 0.5).long() + >>> labels[0].tolist() + ``` +""" + + +@add_start_docstrings( + """ + OpenVINO Model for with a frame classification head on top for tasks like Speaker Diarization. + """, + MODEL_START_DOCSTRING, +) +class OVModelForAudioFrameClassification(OVModel): + """ + Audio Frame Classification model for OpenVINO. 
+ """ + + auto_model_class = AutoModelForAudioFrameClassification + export_feature = TasksManager.infer_task_from_model(auto_model_class) + + @add_start_docstrings_to_model_forward( + AUDIO_INPUTS_DOCSTRING.format("batch_size, sequence_length") + + AUDIO_FRAME_CLASSIFICATION_EXAMPLE.format( + processor_class=_FEATURE_EXTRACTOR_FOR_DOC, + model_class="OVModelForAudioFrameClassification", + checkpoint="anton-l/wav2vec2-base-superb-sd", + ) + ) + def forward( + self, + input_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + **kwargs, + ): + np_inputs = isinstance(input_values, np.ndarray) + if not np_inputs: + input_values = np.array(input_values) + attention_mask = np.array(attention_mask) if attention_mask is not None else attention_mask + + inputs = { + "input_values": input_values, + } + + # Add the attention_mask when needed + if "attention_mask" in self.input_names: + inputs["attention_mask"] = attention_mask + + # Run inference + outputs = self.request(inputs) + logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + + return TokenClassifierOutput(logits=logits) diff --git a/optimum/intel/utils/dummy_openvino_objects.py b/optimum/intel/utils/dummy_openvino_objects.py index ff5be62360..b7c4939a72 100644 --- a/optimum/intel/utils/dummy_openvino_objects.py +++ b/optimum/intel/utils/dummy_openvino_objects.py @@ -26,6 +26,39 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino"]) +class OVModelForAudioFrameClassification(metaclass=DummyObject): + _backends = ["openvino"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino"]) + + +class OVModelForAudioXVector(metaclass=DummyObject): + _backends = ["openvino"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino"]) + + +class OVModelForCTC(metaclass=DummyObject): + _backends = ["openvino"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino"]) + + class OVModelForCausalLM(metaclass=DummyObject): _backends = ["openvino"] diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 2461481f59..e1833ff23a 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -30,7 +30,10 @@ AutoFeatureExtractor, AutoModel, AutoModelForAudioClassification, + AutoModelForAudioFrameClassification, + AutoModelForAudioXVector, AutoModelForCausalLM, + AutoModelForCTC, AutoModelForImageClassification, AutoModelForMaskedLM, AutoModelForQuestionAnswering, @@ -45,13 +48,12 @@ ) from utils_tests import MODEL_NAMES -from optimum.intel.openvino import ( - OV_DECODER_NAME, - OV_DECODER_WITH_PAST_NAME, - OV_ENCODER_NAME, - OV_XML_FILE_NAME, +from optimum.intel import ( OVModelForAudioClassification, + OVModelForAudioFrameClassification, + OVModelForAudioXVector, OVModelForCausalLM, + OVModelForCTC, OVModelForFeatureExtraction, OVModelForImageClassification, OVModelForMaskedLM, @@ -61,6 +63,7 @@ OVModelForTokenClassification, OVStableDiffusionPipeline, ) +from optimum.intel.openvino import OV_DECODER_NAME, OV_DECODER_WITH_PAST_NAME, OV_ENCODER_NAME, OV_XML_FILE_NAME from optimum.intel.openvino.modeling_seq2seq import OVDecoder, OVEncoder 
from optimum.utils import ( DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, @@ -797,3 +800,158 @@ def test_pipeline(self, model_arch): outputs = pipe([np.random.random(16000)]) self.assertEqual(pipe.device, model.device) self.assertTrue(all(item["score"] > 0.0 for item in outputs[0])) + + +class OVModelForCTCIntegrationTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = [ + "data2vec_audio", + "hubert", + "sew", + "sew_d", + "unispeech", + "unispeech_sat", + "wavlm", + "wav2vec2-hf", + "wav2vec2-conformer", + ] + + def _generate_random_audio_data(self): + np.random.seed(10) + t = np.linspace(0, 5.0, int(5.0 * 22050), endpoint=False) + # generate pure sine wave at 220 Hz + audio_data = 0.5 * np.sin(2 * np.pi * 220 * t) + return audio_data + + def test_load_vanilla_transformers_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = OVModelForCTC.from_pretrained(MODEL_NAMES["t5"], export=True) + + self.assertIn("Unrecognized configuration class", str(context.exception)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + ov_model = OVModelForCTC.from_pretrained(model_id, export=True) + self.assertIsInstance(ov_model.config, PretrainedConfig) + + set_seed(SEED) + transformers_model = AutoModelForCTC.from_pretrained(model_id) + processor = AutoFeatureExtractor.from_pretrained(model_id) + input_values = processor(self._generate_random_audio_data(), return_tensors="pt") + + with torch.no_grad(): + transformers_outputs = transformers_model(**input_values) + + for input_type in ["pt", "np"]: + input_values = processor(self._generate_random_audio_data(), return_tensors=input_type) + ov_outputs = ov_model(**input_values) + + self.assertTrue("logits" in ov_outputs) + self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) + + # compare tensor outputs + self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + + gc.collect() + + +class OVModelForAudioXVectorIntegrationTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = [ + "data2vec_audio", + "unispeech_sat", + "wavlm", + "wav2vec2-hf", + "wav2vec2-conformer", + ] + + def _generate_random_audio_data(self): + np.random.seed(10) + t = np.linspace(0, 5.0, int(5.0 * 22050), endpoint=False) + # generate pure sine wave at 220 Hz + audio_data = 0.5 * np.sin(2 * np.pi * 220 * t) + return audio_data + + def test_load_vanilla_transformers_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = OVModelForAudioXVector.from_pretrained(MODEL_NAMES["t5"], export=True) + + self.assertIn("Unrecognized configuration class", str(context.exception)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + ov_model = OVModelForAudioXVector.from_pretrained(model_id, export=True) + self.assertIsInstance(ov_model.config, PretrainedConfig) + + set_seed(SEED) + transformers_model = AutoModelForAudioXVector.from_pretrained(model_id) + processor = AutoFeatureExtractor.from_pretrained(model_id) + input_values = processor(self._generate_random_audio_data(), return_tensors="pt") + + with torch.no_grad(): + transformers_outputs = transformers_model(**input_values) + for input_type in ["pt", "np"]: + input_values = processor(self._generate_random_audio_data(), return_tensors=input_type) + ov_outputs = ov_model(**input_values) + + self.assertTrue("logits" in 
ov_outputs) + self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) + + # compare tensor outputs + self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + self.assertTrue( + torch.allclose(torch.Tensor(ov_outputs.embeddings), transformers_outputs.embeddings, atol=1e-4) + ) + + gc.collect() + + +class OVModelForAudioFrameClassificationIntegrationTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = [ + "data2vec_audio", + "unispeech_sat", + "wavlm", + "wav2vec2-hf", + "wav2vec2-conformer", + ] + + def _generate_random_audio_data(self): + np.random.seed(10) + t = np.linspace(0, 5.0, int(5.0 * 22050), endpoint=False) + # generate pure sine wave at 220 Hz + audio_data = 0.5 * np.sin(2 * np.pi * 220 * t) + return audio_data + + def test_load_vanilla_transformers_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = OVModelForAudioFrameClassification.from_pretrained(MODEL_NAMES["t5"], export=True) + + self.assertIn("Unrecognized configuration class", str(context.exception)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + ov_model = OVModelForAudioFrameClassification.from_pretrained(model_id, export=True) + self.assertIsInstance(ov_model.config, PretrainedConfig) + + set_seed(SEED) + transformers_model = AutoModelForAudioFrameClassification.from_pretrained(model_id) + processor = AutoFeatureExtractor.from_pretrained(model_id) + input_values = processor(self._generate_random_audio_data(), return_tensors="pt") + + with torch.no_grad(): + transformers_outputs = transformers_model(**input_values) + for input_type in ["pt", "np"]: + input_values = processor(self._generate_random_audio_data(), return_tensors=input_type) + ov_outputs = ov_model(**input_values) + + self.assertTrue("logits" in ov_outputs) + self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) + + # compare tensor outputs + self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + + gc.collect() diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 6a14e96796..eeb751153a 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -78,6 +78,7 @@ "vit": "hf-internal-testing/tiny-random-vit", "wavlm": "hf-internal-testing/tiny-random-WavlmModel", "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", + "wav2vec2-hf": "hf-internal-testing/tiny-random-Wav2Vec2Model", "wav2vec2-conformer": "hf-internal-testing/tiny-random-wav2vec2-conformer", "xlm": "hf-internal-testing/tiny-random-xlm", "xlm_roberta": "hf-internal-testing/tiny-xlm-roberta",
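
For quick end-to-end verification of the classes this diff introduces, the sketch below mirrors the CTC docstring example added above. It assumes the `facebook/hubert-large-ls960-ft` checkpoint named in that docstring and uses `AutoProcessor` to load that checkpoint's feature extractor and tokenizer; it is an illustrative usage sketch, not part of the changes themselves.

```python
# Minimal sketch: export a CTC checkpoint to OpenVINO on the fly and transcribe one validation sample.
import numpy as np
from datasets import load_dataset
from transformers import AutoProcessor

from optimum.intel import OVModelForCTC

model_id = "facebook/hubert-large-ls960-ft"  # checkpoint from the CTC docstring example
processor = AutoProcessor.from_pretrained(model_id)
# export=True converts the PyTorch checkpoint to an OpenVINO model before loading it
model = OVModelForCTC.from_pretrained(model_id, export=True)

dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
sampling_rate = dataset.features["audio"].sampling_rate

inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="np")
logits = model(**inputs).logits  # numpy output, since the inputs are numpy arrays
predicted_ids = np.argmax(logits, axis=-1)
transcription = processor.batch_decode(predicted_ids)
print(transcription[0])
```

The same pattern applies to `OVModelForAudioXVector` and `OVModelForAudioFrameClassification`, whose outputs expose speaker `embeddings` and frame-level `logits` respectively, as shown in their docstring examples.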