[PoC] Add max padding ratio to padding aware scheduler #407

Draft · wants to merge 4 commits into habana_main
13 changes: 12 additions & 1 deletion vllm/config.py
@@ -968,6 +968,11 @@ class SchedulerConfig:
policy: The scheduling policy to use. "fcfs" (default) or "priority".
use_padding_aware_scheduling: If True, scheduler will consider padded
tokens in prefill.
max_padding_ratio: Prevents scheduling batches with excessive padding.
A value of 0.2 means that up to 20% of the tokens in a batch may be
padding, and the remaining 80% must be usable data.
Requires padding-aware scheduling.
Must be in range (0, 1). Not set by default.
"""

def __init__(self,
Expand All @@ -986,7 +991,8 @@ def __init__(self,
multi_step_stream_outputs: bool = False,
send_delta_data: bool = False,
policy: str = "fcfs",
use_padding_aware_scheduling=False) -> None:
use_padding_aware_scheduling: bool = False,
max_padding_ratio: Optional[float] = None) -> None:
if max_num_batched_tokens is None:
if enable_chunked_prefill:
if num_scheduler_steps > 1:
@@ -1038,6 +1044,7 @@ def __init__(self,
self.send_delta_data = send_delta_data
self.policy = policy
self.use_padding_aware_scheduling = use_padding_aware_scheduling
self.max_padding_ratio = max_padding_ratio
self._verify_args()

def _verify_args(self) -> None:
@@ -1072,6 +1079,10 @@ def _verify_args(self) -> None:
and not self.use_padding_aware_scheduling:
raise ValueError("max_num_prefill_seqs can be only "
"used with padding-aware-scheduling. ")
if self.max_padding_ratio is not None \
and not self.use_padding_aware_scheduling:
raise ValueError("max_padding_ratio can only be "
"used with padding-aware scheduling.")
if self.use_padding_aware_scheduling and self.chunked_prefill_enabled:
raise ValueError("Padding-aware scheduling currently "
"does not work with chunked prefill ")
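Note that the docstring above says max_padding_ratio must be in range (0, 1), but the diff only validates its dependency on padding-aware scheduling. A minimal sketch of the missing range check, as a hypothetical addition to SchedulerConfig._verify_args (not part of this PR):

    # Hypothetical addition to SchedulerConfig._verify_args; enforces the
    # documented (0, 1) range for max_padding_ratio.
    if self.max_padding_ratio is not None \
            and not 0 < self.max_padding_ratio < 1:
        raise ValueError("max_padding_ratio must be in range (0, 1), "
                         f"got {self.max_padding_ratio}.")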
14 changes: 12 additions & 2 deletions vllm/core/scheduler.py
@@ -105,6 +105,7 @@ def num_curr_seqs(self):
@dataclass
class PaddingAwareSchedulingBudget(SchedulingBudget):
max_num_prefill_seqs: Optional[int] = None
max_padding_ratio: Optional[float] = None
_prefill_request_ids_max_seq_lens: Dict[str,
int] = field(default_factory=dict)
_max_seq_len: int = 0
@@ -179,6 +180,14 @@ def can_schedule(self,
if self.max_num_prefill_seqs is not None and result:
result = self._num_curr_prefill_seqs + num_new_seqs \
<= self.max_num_prefill_seqs
if self._num_curr_prefill_seqs != 0 \
and self.max_padding_ratio is not None and result:
num_tokens = self.num_batched_tokens + num_new_tokens
padding_ratio = 1 - num_tokens / num_new_padded_tokens
result = padding_ratio < self.max_padding_ratio
if not result:
print("[PaddingAwareSchedulerDebug] CANNOT schedule, "
f"exceeded max padding ratio {self.max_padding_ratio} "
f"(num_tokens = {num_tokens}, "
f"num_padded_tokens = {num_new_padded_tokens}, "
f"ratio = {padding_ratio:.2f})")
return result

@property
@@ -1099,8 +1108,9 @@ def _schedule_default(self) -> SchedulerOutputs:
budget = PaddingAwareSchedulingBudget(
token_budget=self.scheduler_config.max_num_batched_tokens,
max_num_seqs=self.scheduler_config.max_num_seqs,
max_num_prefill_seqs=self.scheduler_config.max_num_prefill_seqs
)
max_num_prefill_seqs=self.scheduler_config.max_num_prefill_seqs,
max_padding_ratio=self.scheduler_config.max_padding_ratio)
else:
budget = SchedulingBudget(
token_budget=self.scheduler_config.max_num_batched_tokens,
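To make the new check concrete, here is a worked example of the padding-ratio formula from can_schedule above; all numbers are illustrative:

    # 96 real tokens already batched, 24 new real tokens, and the batch
    # pads out to 160 tokens in total.
    num_batched_tokens = 96
    num_new_tokens = 24
    num_new_padded_tokens = 160

    num_tokens = num_batched_tokens + num_new_tokens        # 120 real tokens
    padding_ratio = 1 - num_tokens / num_new_padded_tokens  # 1 - 120/160 = 0.25

    # With max_padding_ratio = 0.2, the check 0.25 < 0.2 fails, so this
    # batch would be rejected for carrying too much padding.
    assert not padding_ratio < 0.2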
13 changes: 12 additions & 1 deletion vllm/engine/arg_utils.py
@@ -114,6 +114,7 @@ class EngineArgs:
disable_sliding_window: bool = False
use_v2_block_manager: bool = True
use_padding_aware_scheduling: bool = False
max_padding_ratio: Optional[float] = None
swap_space: float = 4 # GiB
cpu_offload_gb: float = 0 # GiB
gpu_memory_utilization: float = 0.90
@@ -400,6 +401,15 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
help=('Use padding-aware scheduling. If True, the scheduler '
'will consider padded tokens in prefill. '
'By default this is set to False. '))
parser.add_argument(
'--max-padding-ratio',
type=float,
default=EngineArgs.max_padding_ratio,
help=('Prevents scheduling batches with excessive padding. '
'A value of 0.2 means that up to 20%% of the tokens in a '
'batch may be padding, and the remaining 80%% must be '
'usable data. Requires padding-aware scheduling. '
'Must be in range (0, 1). Not set by default.'))
parser.add_argument(
'--num-lookahead-slots',
type=int,
@@ -1066,7 +1076,8 @@ def create_engine_config(self) -> EngineConfig:
send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
and parallel_config.use_ray),
policy=self.scheduling_policy,
use_padding_aware_scheduling=self.use_padding_aware_scheduling)
use_padding_aware_scheduling=self.use_padding_aware_scheduling,
max_padding_ratio=self.max_padding_ratio)
lora_config = LoRAConfig(
max_lora_rank=self.max_lora_rank,
max_loras=self.max_loras,
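Taken together, the flag flows from the CLI through EngineArgs into SchedulerConfig. A minimal sketch of driving it programmatically on this branch (the model name is illustrative, not part of the PR):

    from vllm.engine.arg_utils import EngineArgs

    # max_padding_ratio requires padding-aware scheduling; without it,
    # SchedulerConfig._verify_args raises ValueError.
    args = EngineArgs(
        model="facebook/opt-125m",      # illustrative model
        use_padding_aware_scheduling=True,
        max_padding_ratio=0.2,          # allow at most 20% padding per batch
    )
    engine_config = args.create_engine_config()
    assert engine_config.scheduler_config.max_padding_ratio == 0.2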