From 684b4debc293b598546d76cbe41b9ad0c99d4be2 Mon Sep 17 00:00:00 2001
From: Siddharth Venkatesan
Date: Fri, 20 Dec 2024 11:55:49 -0800
Subject: [PATCH] update unit tests

---
 .../properties_manager/vllm_rb_properties.py  |  39 ++++--
 .../tests/test_properties_manager.py          | 117 ++++++++++++++++--
 2 files changed, 138 insertions(+), 18 deletions(-)

diff --git a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
index e972c9fb3..0d5c70cc8 100644
--- a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
+++ b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
@@ -21,8 +21,11 @@
 from djl_python.properties_manager.properties import Properties
 
 DTYPE_MAPPER = {
+    "float32": "float32",
     "fp32": "float32",
+    "float16": "float16",
     "fp16": "float16",
+    "bfloat16": "bfloat16",
     "bf16": "bfloat16",
     "auto": "auto"
 }
@@ -34,15 +37,21 @@ class VllmRbProperties(Properties):
     tensor_parallel_degree: int = 1
     pipeline_parallel_degree: int = 1
     # The following configs have different names in DJL compared to vLLM, either is accepted
-    quantize: Optional[str] = Field(alias="quantization", default=None)
+    quantize: Optional[str] = Field(alias="quantization",
+                                    default=EngineArgs.quantization)
     max_rolling_batch_prefill_tokens: Optional[int] = Field(
-        alias="max_num_batched_tokens", default=None)
-    cpu_offload_gb_per_gpu: Optional[float] = Field(alias="cpu_offload_gb",
-                                                    default=None)
+        alias="max_num_batched_tokens",
+        default=EngineArgs.max_num_batched_tokens)
+    cpu_offload_gb_per_gpu: float = Field(alias="cpu_offload_gb",
+                                          default=EngineArgs.cpu_offload_gb)
     # The following configs have different defaults, or additional processing in DJL compared to vLLM
     dtype: str = "auto"
     max_loras: int = 4
+    # the processing for this config via vllm arg parser is broken
     long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
+    # this config is broken in the vllm arg parser - the default in EngineArgs is True, but it defaults to False
+    # via add_cli_args if not explicitly specified.
+    use_v2_block_manager: bool = True
 
     # Neuron vLLM properties
     device: Optional[str] = None
@@ -50,7 +59,7 @@
     generation_config: Optional[Any] = None
 
     # This allows generic vllm engine args to be passed in and set with vllm
-    model_config = ConfigDict(extra='allow')
+    model_config = ConfigDict(extra='allow', populate_by_name=True)
 
     @field_validator('engine')
     def validate_engine(cls, engine):
@@ -59,6 +68,14 @@ def validate_engine(cls, engine):
                 f"Need python engine to start vLLM RollingBatcher")
         return engine
 
+    @field_validator('dtype')
+    def validate_dtype(cls, val):
+        if val not in DTYPE_MAPPER:
+            raise ValueError(
+                f"Invalid dtype={val} provided. Must be one of {DTYPE_MAPPER.keys()}"
+            )
+        return DTYPE_MAPPER[val]
+
     @model_validator(mode='after')
     def validate_pipeline_parallel(self):
         if self.pipeline_parallel_degree != 1:
@@ -67,9 +84,9 @@ def validate_pipeline_parallel(self):
             )
         return self
 
-    @field_validator('long_lora_scaling_factors', mode='before')
     # TODO: processing of this field is broken in vllm via from_cli_args
     # we should upstream a fix for this to vllm
+    @field_validator('long_lora_scaling_factors', mode='before')
     def validate_long_lora_scaling_factors(cls, val):
         if isinstance(val, str):
             val = ast.literal_eval(val)
@@ -117,20 +134,22 @@ def generate_vllm_engine_arg_dict(self,
             'revision': self.revision,
             'max_loras': self.max_loras,
             'enable_lora': self.enable_lora,
+            'trust_remote_code': self.trust_remote_code,
+            'cpu_offload_gb': self.cpu_offload_gb_per_gpu,
+            'use_v2_block_manager': self.use_v2_block_manager,
         }
         if self.quantize is not None:
             vllm_engine_args['quantization'] = self.quantize
         if self.max_rolling_batch_prefill_tokens is not None:
             vllm_engine_args[
                 'max_num_batched_tokens'] = self.max_rolling_batch_prefill_tokens
-        if self.cpu_offload_gb_per_gpu is not None:
-            vllm_engine_args['cpu_offload_gb'] = self.cpu_offload_gb_per_gpu
         if self.device is not None:
             vllm_engine_args['device'] = self.device
-        if self.preloaded_model is not None:
+        if self.device == 'neuron':
             vllm_engine_args['preloaded_model'] = self.preloaded_model
-        if self.generation_config is not None:
             vllm_engine_args['generation_config'] = self.generation_config
+            vllm_engine_args['block_size'] = passthrough_vllm_engine_args.get(
+                "max_model_len")
         vllm_engine_args.update(passthrough_vllm_engine_args)
         return vllm_engine_args
 
diff --git a/engines/python/setup/djl_python/tests/test_properties_manager.py b/engines/python/setup/djl_python/tests/test_properties_manager.py
index 5647bf88f..371f382f6 100644
--- a/engines/python/setup/djl_python/tests/test_properties_manager.py
+++ b/engines/python/setup/djl_python/tests/test_properties_manager.py
@@ -423,7 +423,7 @@ def test_hf_error_case(self, params):
             HuggingFaceProperties(**params)
 
     def test_vllm_properties(self):
-        # test with valid vllm properties
+
         def validate_vllm_config_and_engine_args_match(
                 vllm_config_value,
                 engine_arg_value,
@@ -435,7 +435,7 @@ def validate_vllm_config_and_engine_args_match(
         def test_vllm_default_properties():
             required_properties = {
                 "engine": "Python",
-                "model_id_or_path": "some_model",
+                "model_id": "some_model",
             }
             vllm_configs = VllmRbProperties(**required_properties)
             engine_args = vllm_configs.get_engine_args()
@@ -451,21 +451,119 @@ def validate_vllm_config_and_engine_args_match(
                 vllm_configs.quantize, engine_args.quantization, None)
             validate_vllm_config_and_engine_args_match(
                 vllm_configs.max_rolling_batch_size, engine_args.max_num_seqs,
-                HuggingFaceProperties.max_rolling_batch_size)
+                32)
             validate_vllm_config_and_engine_args_match(vllm_configs.dtype,
                                                        engine_args.dtype,
                                                        'auto')
             validate_vllm_config_and_engine_args_match(vllm_configs.max_loras,
                                                        engine_args.max_loras,
                                                        4)
-            self.assertEqual(vllm_configs.cpu_offload_gb_per_gpu, None)
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.cpu_offload_gb_per_gpu,
+                engine_args.cpu_offload_gb, EngineArgs.cpu_offload_gb)
             self.assertEqual(
                 len(vllm_configs.get_additional_vllm_engine_args()), 0)
 
+        def test_invalid_pipeline_parallel():
+            properties = {
+                "engine": "Python",
+                "model_id": "some_model",
+                "tensor_parallel_degree": "4",
+                "pipeline_parallel_degree": "2",
+            }
+            with self.assertRaises(ValueError):
+                _ = VllmRbProperties(**properties)
+
+        def test_invalid_engine():
+            properties = {
+                "engine": "bad_engine",
+                "model_id": "some_model",
+            }
+            with self.assertRaises(ValueError):
+                _ = VllmRbProperties(**properties)
+
+        def test_aliases():
+            properties = {
+                "engine": "Python",
+                "model_id": "some_model",
+                "quantization": "awq",
+                "max_num_batched_tokens": "546",
+                "cpu_offload_gb": "7"
+            }
+            vllm_configs = VllmRbProperties(**properties)
+            engine_args = vllm_configs.get_engine_args()
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.quantize, engine_args.quantization, "awq")
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.max_rolling_batch_prefill_tokens,
+                engine_args.max_num_batched_tokens, 546)
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.cpu_offload_gb_per_gpu,
+                engine_args.cpu_offload_gb, 7)
+
+        def test_vllm_passthrough_properties():
+            properties = {
+                "engine": "Python",
+                "model_id": "some_model",
+                "tensor_parallel_degree": "4",
+                "pipeline_parallel_degree": "1",
+                "max_rolling_batch_size": "111",
+                "quantize": "awq",
+                "max_rolling_batch_prefill_tokens": "400",
+                "cpu_offload_gb_per_gpu": "8",
+                "dtype": "bf16",
+                "max_loras": "7",
+                "long_lora_scaling_factors": "1.1, 2.0",
+                "trust_remote_code": "true",
+                "max_model_len": "1024",
+                "enforce_eager": "true",
+                "enable_chunked_prefill": "False",
+                "gpu_memory_utilization": "0.4",
+            }
+            vllm_configs = VllmRbProperties(**properties)
+            engine_args = vllm_configs.get_engine_args()
+            self.assertTrue(
+                len(vllm_configs.get_additional_vllm_engine_args()) > 0)
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.model_id_or_path, engine_args.model, "some_model")
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.tensor_parallel_degree,
+                engine_args.tensor_parallel_size, 4)
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.pipeline_parallel_degree,
+                engine_args.pipeline_parallel_size, 1)
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.max_rolling_batch_size, engine_args.max_num_seqs,
+                111)
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.quantize, engine_args.quantization, "awq")
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.max_rolling_batch_prefill_tokens,
+                engine_args.max_num_batched_tokens, 400)
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.cpu_offload_gb_per_gpu,
+                engine_args.cpu_offload_gb, 8.0)
+            validate_vllm_config_and_engine_args_match(vllm_configs.dtype,
+                                                       engine_args.dtype,
+                                                       "bfloat16")
+            validate_vllm_config_and_engine_args_match(vllm_configs.max_loras,
+                                                       engine_args.max_loras,
+                                                       7)
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.long_lora_scaling_factors,
+                engine_args.long_lora_scaling_factors, (1.1, 2.0))
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.trust_remote_code, engine_args.trust_remote_code,
+                True)
+            self.assertEqual(engine_args.max_model_len, 1024)
+            self.assertEqual(engine_args.enforce_eager, True)
+            self.assertEqual(engine_args.enable_chunked_prefill, False)
+            self.assertEqual(engine_args.gpu_memory_utilization, 0.4)
+
         def test_long_lora_scaling_factors():
             properties = {
                 "engine": "Python",
-                "model_id_or_path": "some_model",
+                "model_id": "some_model",
                 'long_lora_scaling_factors': "3.0"
             }
             vllm_props = VllmRbProperties(**properties)
@@ -500,14 +598,17 @@ def test_long_lora_scaling_factors():
         def test_invalid_long_lora_scaling_factors():
             properties = {
                 "engine": "Python",
-                "model_id_or_path": "some_model",
+                "model_id": "some_model",
                 'long_lora_scaling_factors': "a,b"
             }
-            vllm_props = VllmRbProperties(**properties)
             with self.assertRaises(ValueError):
-                vllm_props.get_engine_args()
+                _ = VllmRbProperties(**properties)
 
         test_vllm_default_properties()
+        test_invalid_pipeline_parallel()
+        test_invalid_engine()
+        test_aliases()
+        test_vllm_passthrough_properties()
         test_long_lora_scaling_factors()
         test_invalid_long_lora_scaling_factors()
 
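
For reference, a minimal usage sketch of the reworked properties (illustrative only, not part of the patch; it assumes vllm and djl_python are installed and mirrors what the new alias and dtype tests exercise):

from djl_python.properties_manager.vllm_rb_properties import VllmRbProperties

# vLLM-style names are accepted as aliases for the DJL names (populate_by_name=True),
# and the dtype validator normalizes shorthand values such as "bf16" to "bfloat16".
props = VllmRbProperties(engine="Python",
                         model_id="some_model",
                         quantization="awq",
                         max_num_batched_tokens="546",
                         cpu_offload_gb="7",
                         dtype="bf16")
engine_args = props.get_engine_args()
print(engine_args.quantization, engine_args.max_num_batched_tokens,
      engine_args.cpu_offload_gb, engine_args.dtype)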