Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

phi3 vision #977

Merged
merged 9 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions optimum/exporters/openvino/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
from .utils import _MAX_UNCOMPRESSED_SIZE, MULTI_MODAL_TEXT_GENERATION_MODELS, clear_class_registry


# Maps a model type to the attention implementation that `main_export` forces at
# load time through the `_attn_implementation` loading kwarg. Needed because some
# models select flash_attn by default, which does not support loading on CPU.
FORCE_ATTN_MODEL_CLASSES = {"phi3-v": "eager"}

if TYPE_CHECKING:
from optimum.intel.openvino.configuration import OVConfig

Expand Down Expand Up @@ -264,6 +266,10 @@ def main_export(

if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED:
loading_kwargs["attn_implementation"] = "eager"

# some models force flash_attn attention by default, which does not support loading the model on CPU
if is_transformers_version(">=", "4.36") and model_type in FORCE_ATTN_MODEL_CLASSES:
loading_kwargs["_attn_implementation"] = FORCE_ATTN_MODEL_CLASSES[model_type]
# there are some differences between the remote and in-library representations of past key values for some models;
# to avoid confusion we disable remote code for them
if (
Expand Down
17 changes: 16 additions & 1 deletion optimum/exporters/openvino/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,7 +712,18 @@ def export_from_model(
)

model_name_or_path = model.config._name_or_path
maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code)
if preprocessors is not None:
# the phi3-vision processor does not have a chat_template attribute, which breaks saving the Processor to disk
if is_transformers_version(">=", "4.45") and model_type == "phi3-v" and len(preprocessors) > 1:
if not hasattr(preprocessors[1], "chat_template"):
preprocessors[1].chat_template = getattr(preprocessors[0], "chat_template", None)
for processor in preprocessors:
try:
processor.save_pretrained(output)
except Exception as ex:
logger.error(f"Saving {type(processor)} failed with {ex}")
eaidova marked this conversation as resolved.
Show resolved Hide resolved
else:
maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code)

files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]

Expand Down Expand Up @@ -891,6 +902,10 @@ def _get_multi_modal_submodels_and_export_configs(
if model_type == "internvl-chat" and preprocessors is not None:
model.config.img_context_token_id = preprocessors[0].convert_tokens_to_ids("<IMG_CONTEXT>")

if model_type == "phi3-v":
model.config.glb_GN = model.model.vision_embed_tokens.glb_GN.tolist()
model.config.sub_GN = model.model.vision_embed_tokens.sub_GN.tolist()

if hasattr(model, "image_newline"):
model.config.image_newline = model.image_newline.tolist()
main_config_cls = TasksManager.get_exporter_config_constructor(
Expand Down
208 changes: 208 additions & 0 deletions optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
MPTModelPatcher,
PersimmonModelPatcher,
Phi3ModelPatcher,
Phi3VisionImageEmbeddingsPatcher,
QwenModelPatcher,
RotaryEmbPatcher,
UpdateCausalMaskModelPatcher,
Expand Down Expand Up @@ -2196,3 +2197,210 @@ def patch_model_for_export(
return MiniCPMVResamplerModelPatcher(self, model, model_kwargs)

return super().patch_model_for_export(model, model_kwargs)


class Phi3VisionConfigBehavior(str, enum.Enum):
    """Sub-model roles a phi3-v export config can be specialised for.

    Members are also plain strings, so a behavior may be passed around either as
    the enum member or as its string value.
    """

    # The full model is exported for this behavior.
    LANGUAGE = "language"
    # `model.model.vision_embed_tokens.img_projection`
    VISION_PROJECTION = "vision_projection"
    # `model.model.vision_embed_tokens`
    VISION_EMBEDDINGS = "vision_embeddings"
    # `model.model.embed_tokens`
    TEXT_EMBEDDINGS = "text_embeddings"


class DummyPhi3VisionProjectionInputGenerator(DummyVisionInputGenerator):
    """Dummy-input generator for the phi3-v vision projection sub-model.

    Produces a random float tensor named ``input`` of shape
    ``[batch_size, feat_size, hidden]``, where ``feat_size`` is derived from the
    dummy image height/width and ``hidden`` from the model config.
    """

    SUPPORTED_INPUT_NAMES = ("input",)

    # Constants of the feature-length formula used in `generate`.
    # NOTE(review): presumably these mirror the phi3-v image preprocessing
    # (336-pixel crops, 144 features per crop, 12 extra features per crop row);
    # confirm against the model's processor.
    _CROP_SIZE = 336
    _FEATURES_PER_CROP = 144
    _EXTRA_FEATURES_PER_ROW = 12

    def __init__(
        self,
        task: str,
        normalized_config: NormalizedVisionConfig,
        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
        width: int = 336,
        height: int = 336,
        **kwargs,
    ):
        """Store the shapes and config values needed by `generate`.

        Args:
            task: Export task name (accepted for interface compatibility, not stored).
            normalized_config: Normalized config wrapping the phi3-v model config;
                its ``embd_layer["embedding_cls"]`` and
                ``img_processor["image_dim_out"]`` entries are read.
            batch_size: Batch dimension of the generated tensor.
            num_channels: Accepted for interface compatibility, not stored.
            width: Dummy image width in pixels.
            height: Dummy image height in pixels.
            **kwargs: Ignored.
        """
        # The parent initializer is not invoked; only the attributes below are set.
        self.batch_size = batch_size
        self._embed_layer_realization = normalized_config.config.embd_layer["embedding_cls"]
        self.image_dim_out = normalized_config.config.img_processor["image_dim_out"]
        self.height = height
        self.width = width

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        """Return a random float tensor shaped like the projection input.

        Args:
            input_name: Name of the requested input (not used to select the shape).
            framework: Framework identifier forwarded to ``random_float_tensor``.
            int_dtype: Unused; kept for interface compatibility.
            float_dtype: Float dtype forwarded to ``random_float_tensor``.
        """
        rows = self.height // self._CROP_SIZE
        cols = self.width // self._CROP_SIZE
        feat_size = (
            (rows * cols + 1) * self._FEATURES_PER_CROP + 1 + (rows + 1) * self._EXTRA_FEATURES_PER_ROW
        )
        # A "linear" embedding layer consumes `image_dim_out` features; any other
        # realization consumes four times as many.
        if self._embed_layer_realization == "linear":
            hidden = self.image_dim_out
        else:
            hidden = self.image_dim_out * 4
        shape = [self.batch_size, feat_size, hidden]
        return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)


@register_in_tasks_manager("phi3-v", *["image-text-to-text"], library_name="transformers")
class Phi3VisionOpenVINOConfig(OnnxConfig):
    """Export config for the sub-models of a ``phi3-v`` model.

    An instance is specialised by a `Phi3VisionConfigBehavior`. The vision
    behaviors are handled by this class itself, while `with_behavior` delegates
    the language and text-embedding behaviors to the export config registered
    for ``phi3``.
    """

    SUPPORTED_BEHAVIORS = [model_type.value for model_type in Phi3VisionConfigBehavior]
    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
    DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
    MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")

    def __init__(
        self,
        config: "PretrainedConfig",
        task: str = "feature-extraction",
        int_dtype: str = "int64",
        float_dtype: str = "fp32",
        behavior: Phi3VisionConfigBehavior = Phi3VisionConfigBehavior.VISION_EMBEDDINGS,
        preprocessors: Optional[List[Any]] = None,
    ):
        """Build the config and select the normalized config for ``behavior``.

        Args:
            config: The phi3-v model config.
            task: Export task name forwarded to the base class.
            int_dtype: Integer dtype used for dummy inputs.
            float_dtype: Float dtype used for dummy inputs.
            behavior: Which sub-model this config describes.
            preprocessors: Optional preprocessors forwarded to the base class.
        """
        super().__init__(
            config=config,
            task=task,
            int_dtype=int_dtype,
            float_dtype=float_dtype,
            preprocessors=preprocessors,
        )
        self._behavior = behavior
        # Keep the untouched model config: `_config` may be swapped below, and
        # `with_behavior` always needs the original one.
        self._orig_config = config
        if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "img_processor"):
            # The vision tower is described by the config of the model named in
            # `img_processor["model_name"]`, so dummy inputs follow that vision config.
            # NOTE(review): `trust_remote_code=True` is hard-coded here regardless of
            # what the caller requested - confirm this is acceptable.
            self._config = AutoConfig.from_pretrained(
                config.img_processor["model_name"], trust_remote_code=True
            ).vision_config
            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
        if self._behavior == Phi3VisionConfigBehavior.VISION_PROJECTION and hasattr(config, "img_processor"):
            self._config = config
            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyPhi3VisionProjectionInputGenerator,)

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        """Input names mapped to their dynamic axes for the vision behaviors.

        For any other behavior the property falls through and yields ``None``.
        """
        if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS:
            return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}}
        if self._behavior == Phi3VisionConfigBehavior.VISION_PROJECTION:
            return {"input": {0: "batch_size", 1: "img_feat_size"}}

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        """Output names mapped to their dynamic axes (empty for non-vision behaviors)."""
        if self._behavior in [Phi3VisionConfigBehavior.VISION_EMBEDDINGS, Phi3VisionConfigBehavior.VISION_PROJECTION]:
            return {"last_hidden_state": {0: "batch_size", 1: "height_width_projection"}}
        return {}

    def _build_language_export_config(self):
        """Instantiate the ``phi3`` text-generation-with-past OpenVINO export config.

        Raises:
            ValueError: If ``phi3`` is not a supported model type, or no
                ``text-generation-with-past`` OpenVINO config is registered for it.
        """
        model_type = "phi3"
        if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
            raise ValueError(
                f"Unsupported language model type provided `{model_type}`. Please define custom export config"
            )

        openvino_configs = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]
        if "text-generation-with-past" not in openvino_configs:
            raise ValueError(
                f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
            )
        internal_export_config_class = openvino_configs["text-generation-with-past"]
        return internal_export_config_class(
            self._orig_config,
            use_past=True,
            use_past_in_inputs=True,
            int_dtype=self.int_dtype,
            float_dtype=self.float_dtype,
        )

    def with_behavior(
        self,
        behavior: Union[str, Phi3VisionConfigBehavior],
    ):
        """
        Creates a config for a different behavior.

        Args:
            behavior ([`Phi3VisionConfigBehavior`] or `str`):
                The behavior to use for the new instance.

        Raises:
            ValueError: If ``behavior`` is a string that is not a valid behavior value,
                or if the ``phi3`` language export config is unavailable.
        """
        if isinstance(behavior, str) and not isinstance(behavior, Phi3VisionConfigBehavior):
            behavior = Phi3VisionConfigBehavior(behavior)

        if behavior == Phi3VisionConfigBehavior.TEXT_EMBEDDINGS:
            internal_export_config = self._build_language_export_config()
            # The embedding config reuses the language model's normalized config class.
            # Note that this assigns a class attribute on `InputEmbedOpenvVINOConfig`.
            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
            export_config = InputEmbedOpenvVINOConfig(
                self._orig_config,
                task="feature-extraction",
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
            )
            return export_config

        if behavior == Phi3VisionConfigBehavior.LANGUAGE:
            internal_export_config = self._build_language_export_config()
            export_config = LMInputEmbedsConfigHelper(internal_export_config)
            export_config._normalized_config = internal_export_config._normalized_config
            return export_config

        if behavior in (Phi3VisionConfigBehavior.VISION_EMBEDDINGS, Phi3VisionConfigBehavior.VISION_PROJECTION):
            # Both vision behaviors are served by this class; `__init__` picks the
            # matching normalized config and dummy input generator.
            return self.__class__(
                self._orig_config,
                task=self.task,
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
                behavior=behavior,
                preprocessors=self._preprocessors,
            )

    def get_model_for_behavior(self, model, behavior: Union[str, Phi3VisionConfigBehavior]):
        """Return the sub-module of ``model`` to export for ``behavior``.

        For every behavior except LANGUAGE the returned sub-module gets
        ``model.config`` attached as its ``config`` attribute.
        """
        if isinstance(behavior, str) and not isinstance(behavior, Phi3VisionConfigBehavior):
            behavior = Phi3VisionConfigBehavior(behavior)

        if behavior == Phi3VisionConfigBehavior.LANGUAGE:
            return model

        if behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS:
            vision_embeddings = model.model.vision_embed_tokens
            vision_embeddings.config = model.config
            return vision_embeddings

        if behavior == Phi3VisionConfigBehavior.VISION_PROJECTION:
            projection = model.model.vision_embed_tokens.img_projection
            projection.config = model.config
            return projection

        if behavior == Phi3VisionConfigBehavior.TEXT_EMBEDDINGS:
            text_embedding = model.model.embed_tokens
            text_embedding.config = model.config
            return text_embedding

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ):
        """Return the patcher used during export.

        The vision-embeddings behavior uses `Phi3VisionImageEmbeddingsPatcher`;
        every other behavior defers to the base class.
        """
        model_kwargs = model_kwargs or {}
        if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS:
            return Phi3VisionImageEmbeddingsPatcher(self, model, model_kwargs)
        return super().patch_model_for_export(model, model_kwargs)
21 changes: 21 additions & 0 deletions optimum/exporters/openvino/model_patcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -1369,6 +1369,7 @@ def phi3_442_forward(
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple, BaseModelOutputWithPast]:
from transformers.cache_utils import Cache, DynamicCache
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
Expand Down Expand Up @@ -3216,3 +3217,23 @@ def forward(self, input):
def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
self._model.forward = self._model.__orig_forward


def phi3_vision_embeddings_forward(self, pixel_values: torch.FloatTensor):
    """Replacement ``forward`` that returns only the image features.

    Intended to be bound to a module exposing ``get_img_features``; the call is
    forwarded unchanged and its result returned.
    """
    return self.get_img_features(pixel_values)


class Phi3VisionImageEmbeddingsPatcher(ModelPatcher):
    """Patcher that swaps the model's ``forward`` for `phi3_vision_embeddings_forward`.

    The original ``forward`` is stashed on the model at construction time and
    put back when the patcher context exits.
    """

    def __init__(
        self,
        config: "OnnxConfig",
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        model_kwargs: Dict[str, Any],
    ):
        # The swap must happen before the base initializer runs, so the base
        # class is constructed around the replacement forward.
        original_forward = model.forward
        model.__orig_forward = original_forward
        model.forward = types.MethodType(phi3_vision_embeddings_forward, model)
        super().__init__(config, model, model_kwargs)

    def __exit__(self, exc_type, exc_value, traceback):
        super().__exit__(exc_type, exc_value, traceback)
        # Undo the swap made in __init__.
        patched_model = self._model
        patched_model.forward = patched_model.__orig_forward
2 changes: 1 addition & 1 deletion optimum/exporters/openvino/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def get_submodels(model):
return custom_export, fn_get_submodels


# Model types treated as multi-modal text-generation models; "phi3-v" is part of the set.
MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv", "phi3-v"]


def save_config(config, save_dir):
Expand Down
Loading
Loading