Skip to content

Commit

Permalink
Enable job suspend for Kueue
Browse files Browse the repository at this point in the history
Signed-off-by: ted chang <[email protected]>
  • Loading branch information
tedhtchang committed Oct 14, 2024
1 parent ab6bc98 commit 734fcb0
Show file tree
Hide file tree
Showing 7 changed files with 1,353 additions and 1,403 deletions.
7 changes: 6 additions & 1 deletion api/lmes/v1alpha1/lmevaljob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import (
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.

// Represent a job's status
// +kubebuilder:validation:Enum=New;Scheduled;Running;Complete;Cancelled
// +kubebuilder:validation:Enum=New;Scheduled;Running;Complete;Cancelled;Suspended
type JobState string

const (
Expand All @@ -42,6 +42,8 @@ const (
CompleteJobState JobState = "Complete"
// The job is cancelled
CancelledJobState JobState = "Cancelled"
// The job is suspended
SuspendedJobState JobState = "Suspended"
)

// +kubebuilder:validation:Enum=NoReason;Succeeded;Failed;Cancelled
Expand Down Expand Up @@ -236,6 +238,9 @@ type LMEvalJobSpec struct {
// Specify extra information for the lm-eval job's pod
// +optional
Pod *LMEvalPodSpec `json:"pod,omitempty"`
// Suspend keeps the job object but runs no pods for it. This is intended to be used by the Kueue integration.
// +optional
Suspend bool `json:"suspend,omitempty"`
}

// LMEvalJobStatus defines the observed state of LMEvalJob
Expand Down
1 change: 0 additions & 1 deletion api/lmes/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion api/tas/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2,602 changes: 1,285 additions & 1,317 deletions config/crd/bases/trustyai.opendatahub.io_lmevaljobs.yaml

Large diffs are not rendered by default.

20 changes: 12 additions & 8 deletions config/crd/bases/trustyai.opendatahub.io_trustyaiservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.11.1
creationTimestamp: null
controller-gen.kubebuilder.io/version: v0.16.3
name: trustyaiservices.trustyai.opendatahub.io
spec:
group: trustyai.opendatahub.io
Expand All @@ -21,14 +20,19 @@ spec:
description: TrustyAIService is the Schema for the trustyaiservices API
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
Expand Down
82 changes: 7 additions & 75 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
creationTimestamp: null
name: manager-role
rules:
- apiGroups:
- ""
resources:
- configmaps
- persistentvolumeclaims
- pods
- secrets
- services
verbs:
- create
- delete
Expand All @@ -28,14 +31,10 @@ rules:
- apiGroups:
- ""
resources:
- pods
- persistentvolumes
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- ""
Expand All @@ -47,18 +46,6 @@ rules:
- get
- list
- watch
- apiGroups:
- ""
resources:
- secrets
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- ""
resources:
Expand Down Expand Up @@ -104,38 +91,6 @@ rules:
- create
- get
- update
- apiGroups:
- ""
resources:
- persistentvolumeclaims
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- ""
resources:
- persistentvolumes
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- services
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- monitoring.coreos.com
resources:
Expand Down Expand Up @@ -212,31 +167,6 @@ rules:
- trustyai.opendatahub.io
resources:
- lmevaljobs
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- trustyai.opendatahub.io
resources:
- lmevaljobs/finalizers
verbs:
- update
- apiGroups:
- trustyai.opendatahub.io
resources:
- lmevaljobs/status
verbs:
- get
- patch
- update
- apiGroups:
- trustyai.opendatahub.io
resources:
- trustyaiservices
verbs:
- create
Expand All @@ -249,12 +179,14 @@ rules:
- apiGroups:
- trustyai.opendatahub.io
resources:
- lmevaljobs/finalizers
- trustyaiservices/finalizers
verbs:
- update
- apiGroups:
- trustyai.opendatahub.io
resources:
- lmevaljobs/status
- trustyaiservices/status
verbs:
- get
Expand Down
43 changes: 43 additions & 0 deletions controllers/lmes/lmevaljob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,10 @@ func (r *LMEvalJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
job.Status.State = lmesv1alpha1.NewJobState
}

if job.Spec.Suspend {
r.handleSuspend(ctx, log, job)
}

// Handle the job based on its state
switch job.Status.State {
case lmesv1alpha1.NewJobState:
Expand All @@ -198,6 +202,10 @@ func (r *LMEvalJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
return r.handleComplete(ctx, log, job)
case lmesv1alpha1.CancelledJobState:
return r.handleCancel(ctx, log, job)
case lmesv1alpha1.SuspendedJobState:
if !job.Spec.Suspend {
return r.handleResume(ctx, log, job)
}
}

return ctrl.Result{}, nil
Expand Down Expand Up @@ -634,6 +642,41 @@ func (r *LMEvalJobReconciler) handleCancel(ctx context.Context, log logr.Logger,
return ctrl.Result{}, err
}

// handleSuspend moves a job whose spec requests suspension into the Suspended
// state. For a job that has already left the New state, the job's pod is
// deleted first (a missing pod is not treated as an error); a job still in the
// New state has its state switched directly. The new state is then persisted
// through the status subresource.
//
// The call is idempotent: when the job is already Suspended nothing is deleted
// or updated, so repeated reconciles of a suspended job do not issue redundant
// API requests.
//
// It returns the error from the status update, if any. A failure to delete the
// pod is logged and reported as a requeue result with a nil error.
func (r *LMEvalJobReconciler) handleSuspend(ctx context.Context, log logr.Logger, job *lmesv1alpha1.LMEvalJob) (ctrl.Result, error) {
	// NOTE(review): this deferred remove also runs after the addOrUpdate call on
	// the pod-deletion failure path below, so the entry added there is
	// presumably dropped again right away — confirm that is intended.
	defer r.pullingJobs.remove(string(job.GetUID()))

	switch job.Status.State {
	case lmesv1alpha1.SuspendedJobState:
		// Already suspended: the pod was removed and the state persisted by an
		// earlier reconcile, so there is nothing left to do.
		return ctrl.Result{}, nil
	case lmesv1alpha1.NewJobState:
		log.Info("Create job in suspend state.")
	default:
		log.Info("Suspend job")
		// IgnoreNotFound maps both a nil error and a NotFound error to nil, so a
		// single check covers "deleted" and "was never there".
		if err := client.IgnoreNotFound(r.deleteJobPod(ctx, job)); err != nil {
			log.Error(err, "failed to delete pod for suspended job")
			return r.pullingJobs.addOrUpdate(string(job.GetUID()), r.options.PodCheckingInterval), nil
		}
	}

	job.Status.State = lmesv1alpha1.SuspendedJobState
	err := r.Status().Update(ctx, job)
	if err != nil {
		log.Error(err, "failed to update job status to suspended")
	}

	return ctrl.Result{}, err
}

// handleResume restarts a suspended job: it builds the job's pod, creates it,
// and then records the Scheduled state through the status subresource.
//
// If the pod cannot be created, the failure is logged and a requeue result
// from pullingJobs is returned with a nil error. If the status update fails,
// that error is logged and returned.
func (r *LMEvalJobReconciler) handleResume(ctx context.Context, log logr.Logger, job *lmesv1alpha1.LMEvalJob) (ctrl.Result, error) {
	log.Info("Resume job")

	if createErr := r.Create(ctx, r.createPod(job, log)); createErr != nil {
		log.Error(createErr, "failed to create pod to resume job")
		retry := r.pullingJobs.addOrUpdate(string(job.GetUID()), r.options.PodCheckingInterval)
		return retry, nil
	}

	// The pod exists now; mark the job as scheduled.
	job.Status.State = lmesv1alpha1.ScheduledJobState
	if updateErr := r.Status().Update(ctx, job); updateErr != nil {
		log.Error(updateErr, "failed to update job status to scheduled")
		return ctrl.Result{}, updateErr
	}
	return ctrl.Result{}, nil
}

func (r *LMEvalJobReconciler) validateCustomCard(job *lmesv1alpha1.LMEvalJob, log logr.Logger) error {
if job.Spec.TaskList.TaskRecipes == nil {
return nil
Expand Down

0 comments on commit 734fcb0

Please sign in to comment.