Commit 1aaac07

add input preprocessing
eaidova committed Nov 14, 2024
1 parent 0a3041f commit 1aaac07
Showing 2 changed files with 20 additions and 3 deletions.
21 changes: 19 additions & 2 deletions optimum/intel/openvino/modeling_visual_language.py
@@ -1809,8 +1809,8 @@ def preprocess_inputs(
             raise ValueError("Tokenizer is required.")
         if image is not None and processor is None:
             raise ValueError("Processor is required.")
-        text_content = f"<image>\n{text}" if image is not None else text
-        messages = [{"role": "user", "content": text_content}]
+        text = f"<image>\n{text}" if image is not None else text
+        messages = [{"role": "user", "content": text}]
         if tokenizer.chat_template is not None:
             text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         if image is not None:
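
The rename above fixes a subtle bug: the "<image>" prefix used to live only in text_content, which fed the chat template, so when no chat template was set the downstream code kept working with the unprefixed text. Reassigning text itself makes both paths carry the tag. A minimal sketch of the fixed flow (the checkpoint id is an illustrative assumption, not part of this commit):

# Sketch of the fixed flow; "llava-hf/llava-1.5-7b-hf" is an example
# checkpoint, not one pinned by this commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-1.5-7b-hf")

text = "What is in this picture?"
text = f"<image>\n{text}"  # the tag now lives on `text` itself
messages = [{"role": "user", "content": text}]
if tokenizer.chat_template is not None:
    # templated path: the tag reaches the template via `messages`
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# untemplated path: `text` still carries "<image>\n", which the old
# `text_content` variant dropped
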
@@ -1957,6 +1957,23 @@ def get_multimodal_embeddings(

         return inputs_embeds, attention_mask, position_ids
 
+    @staticmethod
+    def preprocess_inputs(
+        text: str,
+        image: Optional[Image] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+    ):
+        if processor is None:
+            raise ValueError("Processor is required.")
+        if image is not None and "<|image_1|>" not in text:
+            text = "<|image_1|>\n" + text
+        if getattr(processor.tokenizer, "chat_template", None) is not None:
+            chat_prompt = [{"role": "user", "content": text}]
+            text = processor.tokenizer.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
+        inputs = processor(images=image, text=text, return_tensors="pt")
+        return inputs
+
 
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
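
A minimal usage sketch of the new static method. The class name (_OVPhi3VisionForCausalLM), checkpoint id, and image path are illustrative assumptions; the diff does not show which class the method is attached to.

# Hypothetical usage; class name, model id, and image path are assumptions.
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "microsoft/Phi-3-vision-128k-instruct", trust_remote_code=True
)
image = Image.open("cat.jpg")

inputs = _OVPhi3VisionForCausalLM.preprocess_inputs(
    text="Describe this image.",
    image=image,
    processor=processor,
)
# `inputs` holds the processor output (input_ids, pixel values,
# attention mask, ...), ready to feed the model's generate() call.
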
2 changes: 1 addition & 1 deletion tests/openvino/test_modeling.py
@@ -1883,7 +1883,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
         SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v"]
     REMOTE_CODE_MODELS = ["minicpmv", "nanollava", "phi3_v"]
     TASK = "image-text-to-text"
-    REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava"]
+    REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v"]
 
     IMAGE = Image.open(
         requests.get(
