Phi 3.5 vision instruct #1516

Open · wants to merge 27 commits into master from phi-3.5-vision-instruct

Commits (27)
88d42d8
Phi-3.5-vision-instruc
Wovchena Nov 1, 2024
f33328e
Merge branch 'master' into phi-3.5-vision-instruct
Wovchena Nov 4, 2024
afdacce
Merge branch 'master' into phi-3.5-vision-instruct
Wovchena Nov 7, 2024
9d7c7a0
encode
Wovchena Nov 7, 2024
2531c92
Merge branch 'master' into phi-3.5-vision-instruct
Wovchena Nov 13, 2024
98837fb
Merge branch 'master' into phi-3.5-vision-instruct
Wovchena Nov 25, 2024
21dc498
Add hd_feature_transformer
Wovchena Nov 27, 2024
b34b14e
actual data infer
Wovchena Nov 29, 2024
9becb41
Merge branch 'master' into phi-3.5-vision-instruct
Wovchena Dec 5, 2024
174226f
Merge branch 'master' into phi-3.5-vision-instruct
Wovchena Dec 10, 2024
2da8658
align tokenizers
Wovchena Dec 10, 2024
27d913d
skip resize
Wovchena Dec 11, 2024
66f75d5
vision
Wovchena Dec 12, 2024
c7fc21c
regex
Wovchena Dec 12, 2024
1614a1d
Merge branch 'master' into phi-3.5-vision-instruct
Wovchena Jan 6, 2025
83834a2
code style
Wovchena Jan 8, 2025
524982f
Revert "code style"
Wovchena Jan 8, 2025
edb2dc1
working chat
Wovchena Jan 9, 2025
f4c8bb8
Put resize back
Wovchena Jan 9, 2025
2d988ab
clean up
Wovchena Jan 9, 2025
ffdf449
Merge branch 'master' into phi-3.5-vision-instruct
Wovchena Jan 9, 2025
6e24a25
clean up
Wovchena Jan 9, 2025
3fd78e4
fix compilation
Wovchena Jan 9, 2025
91b170f
fix prefix
Wovchena Jan 9, 2025
793e4c8
Add instructions to reproduce
Wovchena Jan 10, 2025
670d942
Merge branch 'master' into phi-3.5-vision-instruct
Wovchena Jan 14, 2025
cbb354c
Merge branch 'master' into phi-3.5-vision-instruct
Wovchena Jan 15, 2025
11 changes: 11 additions & 0 deletions SUPPORTED_MODELS.md
@@ -362,6 +362,17 @@ In addition to image generation models, `InpaintingPipeline` supports specialized
</ul>
</td>
</tr>
<tr>
<td><code>Phi3VForCausalLM</code></td>
<td>phi3_v</td>
<td>Not supported</td>
<td>
<ul>
<li><a href="https://huggingface.co/microsoft/Phi-3-vision-128k-instruct"><code>microsoft/Phi-3-vision-128k-instruct</code></a></li>
<li><a href="https://huggingface.co/microsoft/Phi-3.5-vision-instruct"><code>microsoft/Phi-3.5-vision-instruct</code></a></li>
</ul>
</td>
</tr>
</tbody>
</table>

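As a usage sketch only: once one of the checkpoints above is exported to OpenVINO IR, it should be loadable through the existing `VLMPipeline` API. The model folder name below is an assumption, and the blank tensor is a stand-in for a real decoded RGB image.

```cpp
#include <iostream>

#include "openvino/genai/visual_language/pipeline.hpp"

int main() {
    // Folder with the exported Phi-3.5-vision-instruct IR (name is an assumption).
    ov::genai::VLMPipeline pipe("Phi-3.5-vision-instruct-ov", "CPU");
    // Blank u8 [1, H, W, 3] image; a real application would decode an image file here.
    ov::Tensor image{ov::element::u8, {1, 336, 336, 3}};
    auto result = pipe.generate("Describe the image.",
                                ov::genai::image(image),
                                ov::genai::max_new_tokens(100));
    std::cout << result << '\n';
}
```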
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/clip.cpp
@@ -12,7 +12,7 @@ static float clip_lerp(float s, float e, float t) {
}

// Bilinear resize function
static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
dst.nx = target_width;
dst.ny = target_height;
dst.buf.resize(3 * target_width * target_height);
1 change: 1 addition & 0 deletions src/cpp/src/visual_language/clip.hpp
@@ -31,6 +31,7 @@ struct clip_image_f32 {
};

void bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_width, int target_height);
void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height);

/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
clip_image_f32 clip_image_preprocess(struct clip_ctx& ctx, const clip_image_u8& img);
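The header change exposes `bilinear_resize` so the new phi3_v preprocessing can reuse it. A minimal call sketch, assuming the packed-RGB `clip_image_u8` aggregate (`{nx, ny, buf}`) used throughout this file:

```cpp
// 640x480 source image, packed RGB, zero-filled for illustration.
clip_image_u8 src{640, 480, std::vector<uint8_t>(640 * 480 * 3, 0)};
clip_image_u8 dst{};
bilinear_resize(src, dst, 336, 336);
// dst.nx == 336, dst.ny == 336, dst.buf.size() == 336 * 336 * 3.
```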
360 changes: 350 additions & 10 deletions src/cpp/src/visual_language/inputs_embedder.cpp

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/cpp/src/visual_language/inputs_embedder.hpp
@@ -65,6 +65,7 @@ class InputsEmbedder {
friend class InputsEmbedderLLaVA;
friend class InputsEmbedderLLaVANext;
friend class InputsEmbedderInternVLChat;
friend class InputsEmbedderPhi3V;
};

} // namespace ov::genai
4 changes: 4 additions & 0 deletions src/cpp/src/visual_language/processor_config.cpp
@@ -41,4 +41,8 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_path) {
if (parsed.contains("image_grid_pinpoints")) {
image_grid_pinpoints = parsed.at("image_grid_pinpoints").get<std::vector<std::pair<int, int>>>();
}
read_json_param(parsed, "num_crops", phi3_v.num_crops);
if (parsed.contains("img_processor")) {
phi3_v.num_img_tokens = parsed.at("img_processor").at("num_img_tokens");
}
}
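For reference, a sketch of the JSON fragment this parsing expects; the values are just the defaults declared in `ProcessorConfig` below, not copied from an actual model config:

```json
{
  "num_crops": 4,
  "img_processor": {
    "num_img_tokens": 144
  }
}
```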
8 changes: 7 additions & 1 deletion src/cpp/src/visual_language/processor_config.hpp
@@ -35,16 +35,22 @@ class ProcessorConfig {
/// llava calls it image_std.
std::array<float, 3> norm_std{1.0f, 1.0f, 1.0f};

// llava specific config params
// A renamed version of norm_mean.
std::array<float, 3> image_mean{0.0f, 0.0f, 0.0f};
std::array<float, 3> image_std{1.0f, 1.0f, 1.0f};
// llava specific config params
size_t crop_size_height = 336;
size_t crop_size_width = 336;
size_t size_shortest_edge = 336;

// llava-next specific config params
std::vector<std::pair<int, int>> image_grid_pinpoints{{336, 672}, {672, 336}, {672, 672}, {1008, 336}, {336, 1008}};

struct {
size_t num_crops = 4;
size_t num_img_tokens = 144;
} phi3_v;

/// @brief Default constructor
ProcessorConfig() = default;
/// @brief Construct ProcessorConfig from values in json_path.
205 changes: 205 additions & 0 deletions src/cpp/src/visual_language/vision_encoder.cpp
@@ -644,8 +644,204 @@ ov::Tensor get_pixel_values_internvl(const ov::Tensor& image, const ProcessorConfig& config) {
}
return output_tensor;
}

namespace phi3_v {
constexpr size_t INPUT_IMAGE_SIZE = 336;

ov::Tensor padding_336(const ov::Tensor& unpadded) {
ov::Shape _1ss3 = unpadded.get_shape();
size_t s1 = _1ss3.at(1), s2 = _1ss3.at(2);
if (s1 < s2) {
size_t tar = size_t(std::ceil(float(s1) / INPUT_IMAGE_SIZE) * INPUT_IMAGE_SIZE);
size_t top_padding = (tar - s1) / 2;
ov::Tensor padded{ov::element::u8, {1, tar, s2, 3}};
uint8_t* padded_data = padded.data<uint8_t>();
std::fill_n(padded_data, padded.get_size(), 255);
std::copy_n(unpadded.data<uint8_t>(), unpadded.get_size(), padded_data + top_padding * s2 * 3);
return padded;
}
size_t tar = size_t(std::ceil(float(s2) / INPUT_IMAGE_SIZE) * INPUT_IMAGE_SIZE);
size_t left_padding = (tar - s2) / 2;
ov::Tensor padded{ov::element::u8, {1, s1, tar, 3}};
uint8_t* padded_data = padded.data<uint8_t>();
std::fill_n(padded_data, padded.get_size(), 255);
uint8_t* unpadded_data = unpadded.data<uint8_t>();
for (size_t row = 0; row < s1; ++row) {
std::copy_n(unpadded_data + row * s2 * 3, s2 * 3, padded_data + row * tar * 3 + left_padding * 3);
}
return padded;
}

ov::Tensor HD_transform(const ov::Tensor& uint8, size_t num_crops) {
ov::Shape _1hwc = uint8.get_shape();
size_t height = _1hwc.at(1), width = _1hwc.at(2);
bool trans = false;
if (width < height) {
std::swap(height, width);
trans = true;
}
float ratio = float(width) / height;
unsigned scale = 1;
while (scale * std::ceil(scale / ratio) <= num_crops) {
++scale;
}
--scale;
size_t new_w = scale * INPUT_IMAGE_SIZE;
size_t new_h = new_w / ratio;
clip_image_u8 src{}, dst{};
uint8_t* uint8_data = uint8.data<uint8_t>();
if (trans) {
src = clip_image_u8{int(height), int(width), {uint8_data, uint8_data + uint8.get_size()}};
bilinear_resize(src, dst, new_h, new_w);
return padding_336(ov::Tensor{ov::element::u8, {1, new_w, new_h, 3}, dst.buf.data()});
}
src = clip_image_u8{int(width), int(height), {uint8_data, uint8_data + uint8.get_size()}};
bilinear_resize(src, dst, new_w, new_h);
return padding_336(ov::Tensor{ov::element::u8, {1, new_h, new_w, 3}, dst.buf.data()});
}

ov::Tensor mean_scale(const ov::Tensor& uint8, const ProcessorConfig& config) {
uint8_t* uint_8_data = uint8.data<uint8_t>();
ov::Tensor float_normalized{ov::element::f32, uint8.get_shape()};
float* float_data = float_normalized.data<float>();
OPENVINO_ASSERT(0 == uint8.get_size() % 3, "RGB");
for (size_t idx = 0; idx < uint8.get_size(); idx += 3) {
float_data[idx] = (float(uint_8_data[idx]) / 255.0f - config.image_mean[0]) / config.image_std[0];
float_data[idx + 1] = (float(uint_8_data[idx + 1]) / 255.0f - config.image_mean[1]) / config.image_std[1];
float_data[idx + 2] = (float(uint_8_data[idx + 2]) / 255.0f - config.image_mean[2]) / config.image_std[2];
}
return float_normalized;
}

ov::Tensor channels_first(const ov::Tensor& _1hw3) {
ov::Shape shape = _1hw3.get_shape();
ov::Tensor _13hw = ov::Tensor{ov::element::f32, {1, 3, shape.at(1), shape.at(2)}};
float* _1hw3_data = _1hw3.data<float>();
float* _13hw_data = _13hw.data<float>();
for (size_t plane = 0; plane < 3; ++plane) {
for (size_t row = 0; row < shape.at(1); ++row) {
for (size_t col = 0; col < shape.at(2); ++col) {
_13hw_data[plane * shape.at(1) * shape.at(2) + row * shape.at(2) + col] = _1hw3_data[row * shape.at(2) * 3 + col * 3 + plane];
}
}
}
return _13hw;
}

// Reimplementation of Python im.reshape(1, 3, h//336, 336, w//336, 336).permute(0,2,4,1,3,5).reshape(-1, 3, 336, 336)
ov::Tensor slice_image(const ov::Tensor& image) {
ov::Shape shape = image.get_shape();
size_t N = shape[0];
size_t C = shape[1];
size_t H = shape[2];
size_t W = shape[3];

size_t num_h_slices = H / INPUT_IMAGE_SIZE;
size_t num_w_slices = W / INPUT_IMAGE_SIZE;

// Step 1: Define and populate the reshaped tensor in the correct shape order
ov::Tensor reshaped{ov::element::f32, {N, num_h_slices, num_w_slices, C, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE}};
float* reshaped_data = reshaped.data<float>();
float* image_data = image.data<float>();

// Populate the reshaped tensor
for (size_t n = 0; n < N; ++n) {
for (size_t h = 0; h < num_h_slices; ++h) {
for (size_t w = 0; w < num_w_slices; ++w) {
for (size_t c = 0; c < C; ++c) {
for (size_t i = 0; i < INPUT_IMAGE_SIZE; ++i) {
for (size_t j = 0; j < INPUT_IMAGE_SIZE; ++j) {
size_t src_idx = n * C * H * W + c * H * W + (h * INPUT_IMAGE_SIZE + i) * W + (w * INPUT_IMAGE_SIZE + j);
size_t dst_idx = n * num_h_slices * num_w_slices * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE +
h * num_w_slices * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE +
w * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE +
c * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE +
i * INPUT_IMAGE_SIZE + j;
reshaped_data[dst_idx] = image_data[src_idx];
}
}
}
}
}
}

// Step 2: Define the permuted tensor in the final shape
ov::Tensor permuted{ov::element::f32, {N * num_h_slices * num_w_slices, C, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE}};
float* permuted_data = permuted.data<float>();

// Perform permutation by flattening N, num_h_slices, and num_w_slices
for (size_t n = 0; n < N; ++n) {
for (size_t h = 0; h < num_h_slices; ++h) {
for (size_t w = 0; w < num_w_slices; ++w) {
for (size_t c = 0; c < C; ++c) {
for (size_t i = 0; i < INPUT_IMAGE_SIZE; ++i) {
for (size_t j = 0; j < INPUT_IMAGE_SIZE; ++j) {
size_t src_idx = n * num_h_slices * num_w_slices * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE +
h * num_w_slices * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE +
w * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE +
c * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE +
i * INPUT_IMAGE_SIZE + j;
size_t dst_idx = (n * num_h_slices * num_w_slices + h * num_w_slices + w) * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE +
c * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE +
i * INPUT_IMAGE_SIZE + j;
permuted_data[dst_idx] = reshaped_data[src_idx];
}
}
}
}
}
}

return permuted;
}

ov::Tensor concatenate_batch(const ov::Tensor& float_first, const ov::Tensor& float_second) {
ov::Shape shape_first = float_first.get_shape();
ov::Shape shape_second = float_second.get_shape();
OPENVINO_ASSERT(shape_first.at(1) == shape_second.at(1), "Channels must be the same");
OPENVINO_ASSERT(shape_first.at(2) == shape_second.at(2), "Height must be the same");
OPENVINO_ASSERT(shape_first.at(3) == shape_second.at(3), "Width must be the same");
ov::Tensor concatenated{ov::element::f32, {shape_first.at(0) + shape_second.at(0), shape_first.at(1), shape_first.at(2), shape_first.at(3)}};
float* concatenated_data = concatenated.data<float>();
float* first_data = float_first.data<float>();
float* second_data = float_second.data<float>();
std::copy(first_data, first_data + float_first.get_size(), concatenated_data);
std::copy(second_data, second_data + float_second.get_size(), concatenated_data + float_first.get_size());
return concatenated;
}

ov::Tensor pad_to_max_num_crops_tensor(const ov::Tensor& nchw, size_t max_crops) {
ov::Shape shape = nchw.get_shape();
size_t num_crops = shape[0];
if (num_crops >= max_crops) {
return nchw;
}
ov::Tensor padded{ov::element::f32, {max_crops, shape[1], shape[2], shape[3]}};
float* padded_data = padded.data<float>();
float* nchw_data = nchw.data<float>();
std::copy_n(nchw_data, nchw.get_size(), padded_data);
return padded;
}

std::tuple<ov::Tensor, ImageSize> get_pixel_values_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) {
ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops);
ImageSize image_size{hd_image.get_shape().at(2), hd_image.get_shape().at(1)};
clip_image_u8 img{int(hd_image.get_shape().at(2)), int(hd_image.get_shape().at(1)), {hd_image.data<uint8_t>(), hd_image.data<uint8_t>() + hd_image.get_size()}};
clip_image_u8 dst;
bicubic_resize(img, dst, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE);
ov::Tensor global_image{ov::element::u8, {1, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE, 3}, dst.buf.data()};
global_image = mean_scale(global_image, config);
hd_image = mean_scale(hd_image, config);
global_image = channels_first(global_image);
hd_image = channels_first(hd_image);
ov::Tensor slices = slice_image(hd_image);
ov::Tensor concatenated = concatenate_batch(global_image, slices);
ov::Tensor pixel_values = pad_to_max_num_crops_tensor(concatenated, config.phi3_v.num_crops);
return {std::move(pixel_values), image_size};
}
} // namespace phi3_v
} // anonymous namespace

VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config) :
model_type(model_type) {
auto compiled_model = utils::singleton_core().compile_model(model_dir / "openvino_vision_embeddings_model.xml",
@@ -680,6 +876,8 @@ EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfig& config) {
return encode_llava_next(image, config);
} else if (model_type == VLMModelType::INTERNVL_CHAT) {
return encode_internvl(image, config);
} else if (model_type == VLMModelType::PHI3_V) {
return encode_phi3_v(image, config);
} else {
OPENVINO_THROW("Unsupported type of VisionEncoder");
}
@@ -753,3 +951,10 @@ EncodedImage VisionEncoder::encode_internvl(const ov::Tensor& image, const ProcessorConfig& config) {

return {std::move(image_features), resized_source_size};
}

EncodedImage VisionEncoder::encode_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) {
const auto& [pixel_values, image_size] = phi3_v::get_pixel_values_phi3_v(image, config);
m_vision_encoder.set_input_tensor(pixel_values);
m_vision_encoder.infer();
return {m_vision_encoder.get_output_tensor(), image_size};
}
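To make the `HD_transform` sizing concrete, here is a self-contained sketch of just its scale selection, using the default `num_crops = 4` and an assumed 1000x500 landscape input:

```cpp
#include <cmath>
#include <cstddef>
#include <cstdio>

int main() {
    const size_t num_crops = 4;           // config.phi3_v.num_crops default
    const size_t input_image_size = 336;  // phi3_v::INPUT_IMAGE_SIZE
    float ratio = 1000.0f / 500.0f;       // width / height == 2.0
    // Largest scale such that scale * ceil(scale / ratio) <= num_crops.
    unsigned scale = 1;
    while (scale * std::ceil(scale / ratio) <= num_crops) {
        ++scale;
    }
    --scale;  // scale == 2: 2 * ceil(2 / 2.0) == 2 <= 4, while 3 * ceil(3 / 2.0) == 6 > 4
    size_t new_w = scale * input_image_size;  // 672
    size_t new_h = size_t(new_w / ratio);     // 336; padding_336 then pads up to a multiple of 336
    std::printf("resize to %zu x %zu\n", new_w, new_h);
}
```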
4 changes: 4 additions & 0 deletions src/cpp/src/visual_language/vision_encoder.hpp
@@ -158,5 +158,9 @@ class VisionEncoder {
EncodedImage encode_internvl(
const ov::Tensor& image, const ProcessorConfig& config
);

EncodedImage encode_phi3_v(
const ov::Tensor& image, const ProcessorConfig& config
);
};
}
9 changes: 9 additions & 0 deletions src/cpp/src/visual_language/vlm_config.cpp
@@ -19,4 +19,13 @@ ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) {

// Setting llava_next specific config params
read_json_param(parsed, "image_newline", image_newline);
// phi3_v
if (parsed.contains("sub_GN")) {
sub_GN = parsed.at("sub_GN").get<std::vector<std::vector<std::vector<std::vector<float>>>>>().at(0).at(0).at(0);
}
OPENVINO_ASSERT(sub_GN.size() == 4096);
if (parsed.contains("glb_GN")) {
glb_GN = parsed.at("glb_GN").get<std::vector<std::vector<std::vector<float>>>>().at(0).at(0);
}
OPENVINO_ASSERT(glb_GN.size() == 4096);
}
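The asserts pin down the expected config.json shapes: `sub_GN` parses as a [1, 1, 1, 4096] array and `glb_GN` as [1, 1, 4096]. A shape-only sketch, with the 4096 floats per vector truncated to three made-up values:

```json
{
  "sub_GN": [[[[-0.0087, 0.0213, 0.0056]]]],
  "glb_GN": [[[0.0112, -0.0034, 0.0078]]]
}
```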
3 changes: 3 additions & 0 deletions src/cpp/src/visual_language/vlm_config.hpp
@@ -54,6 +54,9 @@ class VLMConfig {
std::string image_context_token = "<IMG_CONTEXT>";
/// @brief A string token denoting end of image embeddings for InternVL2 model.
std::string image_end_token = "</img>";
/// @brief phi3_v new line token embedding to separate images.
std::vector<float> sub_GN = std::vector(4096, 0.0f);
std::vector<float> glb_GN = std::vector(4096, 0.0f);

/// @brief Default constructor.
VLMConfig() = default;
4 changes: 3 additions & 1 deletion src/cpp/src/visual_language/vlm_model_type.hpp
@@ -16,14 +16,16 @@ enum class VLMModelType {
LLAVA,
LLAVA_NEXT,
INTERNVL_CHAT,
PHI3_V,
};

inline VLMModelType to_vlm_model_type(const std::string& value) {
static const std::unordered_map<std::string, VLMModelType> model_types_map = {
{"minicpmv", VLMModelType::MINICPM},
{"llava", VLMModelType::LLAVA},
{"llava_next", VLMModelType::LLAVA_NEXT},
{"internvl_chat", VLMModelType::INTERNVL_CHAT}
{"internvl_chat", VLMModelType::INTERNVL_CHAT},
{"phi3_v", VLMModelType::PHI3_V}
};

auto it = model_types_map.find(value);
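With the new map entry, a converted model whose config reports `"model_type": "phi3_v"` resolves to the new enum value. A trivial check, as a sketch:

```cpp
OPENVINO_ASSERT(to_vlm_model_type("phi3_v") == VLMModelType::PHI3_V);
```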