diff --git a/vllm/config.py b/vllm/config.py
index 67a4ec0761cc3..df892ca7936d8 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -968,6 +968,11 @@ class SchedulerConfig:
         policy: The scheduling policy to use. "fcfs" (default) or "priority".
         use_padding_aware_scheduling: If True, scheduler will consider padded
             tokens in prefill.
+        max_padding_ratio: Prevents scheduling batches with excessive padding.
+            For example, 0.2 means that up to 20% of the tokens in a batch
+            may be padding, and the remaining 80% must be usable data.
+            Requires padding-aware scheduling.
+            Must be in range (0, 1). Not set by default.
     """
 
     def __init__(self,
@@ -986,7 +991,8 @@ def __init__(self,
                  multi_step_stream_outputs: bool = False,
                  send_delta_data: bool = False,
                  policy: str = "fcfs",
-                 use_padding_aware_scheduling=False) -> None:
+                 use_padding_aware_scheduling=False,
+                 max_padding_ratio: Optional[float] = None) -> None:
         if max_num_batched_tokens is None:
             if enable_chunked_prefill:
                 if num_scheduler_steps > 1:
@@ -1038,6 +1044,7 @@ def __init__(self,
         self.send_delta_data = send_delta_data
         self.policy = policy
         self.use_padding_aware_scheduling = use_padding_aware_scheduling
+        self.max_padding_ratio = max_padding_ratio
         self._verify_args()
 
     def _verify_args(self) -> None:
@@ -1072,6 +1079,10 @@ def _verify_args(self) -> None:
                 and not self.use_padding_aware_scheduling:
             raise ValueError("max_num_prefill_seqs can be only "
                              "used with padding-aware-scheduling. ")
+        if self.max_padding_ratio is not None \
+                and not self.use_padding_aware_scheduling:
+            raise ValueError("max_padding_ratio can only be used "
+                             "with padding-aware scheduling.")
         if self.use_padding_aware_scheduling and self.chunked_prefill_enabled:
             raise ValueError("Padding-aware scheduling currently "
                              "does not work with chunked prefill ")
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 1c69c72933b79..1608c13aa7b40 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -105,6 +105,7 @@ def num_curr_seqs(self):
 @dataclass
 class PaddingAwareSchedulingBudget(SchedulingBudget):
     max_num_prefill_seqs: Optional[int] = None
+    max_padding_ratio: Optional[float] = None
     _prefill_request_ids_max_seq_lens: Dict[str,
                                             int] = field(default_factory=dict)
     _max_seq_len: int = 0
@@ -179,6 +180,14 @@ def can_schedule(self,
         if self.max_num_prefill_seqs is not None and result:
             result = self._num_curr_prefill_seqs + num_new_seqs \
                 <= self.max_num_prefill_seqs
+        if self._num_curr_prefill_seqs != 0 and self.max_padding_ratio is not None and result:  # noqa: E501
+            num_tokens = self.num_batched_tokens + num_new_tokens
+            padding_ratio = 1 - num_tokens / num_new_padded_tokens
+            result = padding_ratio < self.max_padding_ratio
+            if not result:
+                print(
+                    f"[PaddingAwareSchedulerDebug] CANNOT schedule, exceeded max padding ratio {self.max_padding_ratio} (num_tokens = {num_tokens}, num_padded_tokens = {num_new_padded_tokens}, ratio = {padding_ratio:.2f})"  # noqa: E501
+                )
         return result
 
     @property
@@ -1099,8 +1108,9 @@ def _schedule_default(self) -> SchedulerOutputs:
             budget = PaddingAwareSchedulingBudget(
                 token_budget=self.scheduler_config.max_num_batched_tokens,
                 max_num_seqs=self.scheduler_config.max_num_seqs,
-                max_num_prefill_seqs=self.scheduler_config.max_num_prefill_seqs
-            )
+                max_num_prefill_seqs=self.scheduler_config.
+                max_num_prefill_seqs,
+                max_padding_ratio=self.scheduler_config.max_padding_ratio)
         else:
             budget = SchedulingBudget(
                 token_budget=self.scheduler_config.max_num_batched_tokens,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index cdf1401816800..012f24162e114 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -114,6 +114,7 @@ class EngineArgs:
     disable_sliding_window: bool = False
     use_v2_block_manager: bool = True
    use_padding_aware_scheduling: bool = False
+    max_padding_ratio: Optional[float] = None
     swap_space: float = 4  # GiB
     cpu_offload_gb: float = 0  # GiB
     gpu_memory_utilization: float = 0.90
@@ -400,6 +401,15 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             help=('Use padding-aware scheduling. If True, the scheduler '
                   'will consider padded tokens in prefill. '
                   'By default this is set to False. '))
+        parser.add_argument(
+            '--max-padding-ratio',
+            type=float,
+            default=EngineArgs.max_padding_ratio,
+            help=('Prevents scheduling batches with excessive padding. '
+                  'For example, 0.2 means that up to 20%% of the tokens in '
+                  'a batch may be padding, and the remaining 80%% must be '
+                  'usable data. Requires padding-aware scheduling. '
+                  'Must be in range (0, 1). Not set by default.'))
         parser.add_argument(
             '--num-lookahead-slots',
             type=int,
@@ -1066,7 +1076,8 @@ def create_engine_config(self) -> EngineConfig:
             send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                              and parallel_config.use_ray),
             policy=self.scheduling_policy,
-            use_padding_aware_scheduling=self.use_padding_aware_scheduling)
+            use_padding_aware_scheduling=self.use_padding_aware_scheduling,
+            max_padding_ratio=self.max_padding_ratio)
         lora_config = LoRAConfig(
             max_lora_rank=self.max_lora_rank,
             max_loras=self.max_loras,
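
For reference, the padding-ratio gate that this diff adds to PaddingAwareSchedulingBudget.can_schedule reduces to the check sketched below. This is a minimal standalone illustration of the formula taken from the diff, not code from the PR itself; the helper name and the numbers in the usage example are hypothetical.

    def exceeds_max_padding_ratio(num_batched_tokens: int,
                                  num_new_tokens: int,
                                  num_new_padded_tokens: int,
                                  max_padding_ratio: float) -> bool:
        # Usable tokens the batch would contain after scheduling the request.
        num_tokens = num_batched_tokens + num_new_tokens
        # Fraction of the padded batch that would be padding rather than data.
        padding_ratio = 1 - num_tokens / num_new_padded_tokens
        # can_schedule() only admits the request if the ratio stays below
        # max_padding_ratio, so anything at or above it is rejected.
        return padding_ratio >= max_padding_ratio

    # Example: 128 + 64 = 192 usable tokens padded out to 256 gives a padding
    # ratio of 0.25, so --max-padding-ratio 0.2 would reject this batch.
    assert exceeds_max_padding_ratio(128, 64, 256, 0.2)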