Skip to content

Commit

Permalink
Permit no-gpu pods if gpu usage is overcommitted
Browse files Browse the repository at this point in the history
  • Loading branch information
shouhanzen committed Mar 7, 2024
1 parent 468188b commit b0d721e
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 18 deletions.
Empty file added ref.txt
Empty file.
41 changes: 23 additions & 18 deletions src/dsmlp/app/gpu_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,37 +15,42 @@


class GPUValidator(ComponentValidator):
    """Reject pods whose GPU demand would push a namespace past its GPU quota."""

    def __init__(self, kube: KubeClient, logger: Logger) -> None:
        """Keep the cluster client and logger for use during validation."""
        self.kube = kube
        self.logger = logger

    @staticmethod
    def _container_gpus(container) -> int:
        """Return the GPU count one container needs: max(request, limit).

        A missing resources section, a missing GPU entry, or a None value is
        counted as 0 rather than treated as an error.
        """
        requested, limit = 0, 0
        try:
            requested = int(container.resources.requests[GPU_LABEL])
        except (KeyError, AttributeError, TypeError):
            pass
        try:
            limit = int(container.resources.limits[GPU_LABEL])
        except (KeyError, AttributeError, TypeError):
            pass
        return max(requested, limit)

    def validate_pod(self, request: Request):
        """
        Check the pod's GPU demand against the GPU quota of its namespace.

        The pod is accepted (the method returns None) when it uses the low
        priority class, when none of its containers ask for a GPU, or when
        its demand plus the GPUs already in use in the namespace stays within
        the namespace's quota.

        Raises:
            ValidationFailure: the requested GPUs plus those already in use
                would exceed the namespace's GPU quota.
        """

        # Low priority pods pass through
        priority = request.object.spec.priorityClassName
        if priority is not None and priority == LOW_PRIORITY_CLASS:
            return

        namespace = self.kube.get_namespace(request.namespace)
        curr_gpus = self.kube.get_gpus_in_namespace(request.namespace)

        # Each container is counted exactly once.
        utilized_gpus = sum(
            self._container_gpus(container)
            for container in request.object.spec.containers
        )

        # Short circuit if no GPUs requested (permits overcap): a pod that
        # needs no GPU is admitted even if the namespace is already over quota.
        if utilized_gpus == 0:
            return

        if utilized_gpus + curr_gpus > namespace.gpu_quota:
            raise ValidationFailure(
                f"GPU quota exceeded. Wanted {utilized_gpus} but with {curr_gpus} already in use, the quota of {namespace.gpu_quota} would be exceeded.")
7 changes: 7 additions & 0 deletions tests/app/test_gpu_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,13 @@ def test_limit_exceeded(self):
gen_request(gpu_lim=6), expected=False, message="GPU quota exceeded. Wanted 6 but with 5 already in use, the quota of 10 would be exceeded."
)

# A pod that requests no GPUs must be admitted even when the namespace's GPU usage is overcommitted
def test_low_priority_overcap(self):
    """A default request validates even with 11 GPUs already in use for 'user10'.

    NOTE(review): presumably 11 is above the quota for 'user10' and
    gen_request() with no arguments asks for no GPUs -- confirm against the
    fixtures.
    """
    self.kube_client.set_existing_gpus('user10', 11)

    no_gpu_request = gen_request()
    self.try_validate(no_gpu_request, expected=True)

def try_validate(self, json, expected: bool, message: str = None):
    """Run `json` through a fresh GPUValidator via try_val_with_component.

    `expected` and `message` are forwarded unchanged to the shared helper.
    """
    validator = GPUValidator(self.kube_client, self.logger)
    try_val_with_component(validator, json, expected, message)

0 comments on commit b0d721e

Please sign in to comment.