From b978df7527fc092cf63bf58bc0ce030d36915a50 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Tue, 29 Oct 2024 20:54:19 +0400
Subject: [PATCH] fix images processing

---
 .../intel/openvino/modeling_visual_language.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index c89bc56346..0b487a8524 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -214,7 +214,17 @@ def forward(self, image_feature, pos_embed, key_padding_mask):
         return result
 
 
-MODEL_PARTS_CLS_MAPPING = {"resampler": OVResampler}
+class OVVisionProjection(OVModelPart):
+    _model_name = "vision_projection"
+
+    def forward(self, img_features):
+        return self.request(img_features)[0]
+
+
+MODEL_PARTS_CLS_MAPPING = {
+    "resampler": OVResampler,
+    "vision_projection": OVVisionProjection
+}
 
 
 class OVModelForVisualCausalLM(OVBaseModel, GenerationMixin):
@@ -1665,12 +1675,10 @@ def get_multimodal_embeddings(
         input_ids = input_ids.clamp_min(0).clamp_max(self.config.vocab_size)
         inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids, **kwargs))
         if has_image:
-            vision_embeds = self.get_vision_embeddings(
-                pixel_values, input_ids=input_ids, image_sizes=image_sizes, **kwargs
-            )
+            vision_embeds = self.get_vision_embeddings(pixel_values, input_ids=input_ids, image_sizes=image_sizes, **kwargs)
             image_features_proj = torch.from_numpy(vision_embeds)
             inputs_embeds = inputs_embeds.index_put(positions, image_features_proj, accumulate=False)
-
+
         return inputs_embeds, attention_mask, position_ids
 
 MODEL_TYPE_TO_CLS_MAPPING = {
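
Note (not part of the patch): a minimal, plain-PyTorch sketch of the placement pattern that the patched get_multimodal_embeddings relies on. Placeholder token ids are negative, their positions are gathered with torch.nonzero, and index_put(..., accumulate=False) overwrites those rows of the text embeddings with the projected image features. Shapes and names such as hidden_size below are illustrative assumptions, not values taken from the model.

import torch

# Toy dimensions; hidden_size is illustrative only.
hidden_size = 8

# Fake token ids: negative values mark image placeholders
# (the real code treats ids in (-1e9, 0) as placeholders).
input_ids = torch.tensor([[1, 2, -1, -1, 3]])
positions = torch.nonzero(input_ids < 0, as_tuple=True)

# Stand-ins for the text embeddings and the projected vision features.
inputs_embeds = torch.zeros(1, 5, hidden_size)
image_features_proj = torch.ones(2, hidden_size)  # one row per placeholder token

# Same call as in the patch: overwrite (not accumulate) the placeholder rows.
inputs_embeds = inputs_embeds.index_put(positions, image_features_proj, accumulate=False)
print(inputs_embeds[0, 2, 0].item(), inputs_embeds[0, 3, 0].item())  # 1.0 1.0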