From 5dac4f0e2be7a9e77bd71139c1bcf8b56d63016b Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Fri, 14 Jul 2023 14:39:46 +0100 Subject: [PATCH 1/4] Create both local and central ServiceMonitor --- controllers/monitor.go | 107 ++++++++++++++++++++++ controllers/trustyaiservice_controller.go | 80 ++-------------- 2 files changed, 115 insertions(+), 72 deletions(-) create mode 100644 controllers/monitor.go diff --git a/controllers/monitor.go b/controllers/monitor.go new file mode 100644 index 0000000..7fac57f --- /dev/null +++ b/controllers/monitor.go @@ -0,0 +1,107 @@ +package controllers + +import ( + "context" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +func generateServiceMonitorSpec(deploymentNamespace string, monitoredNamespace string, serviceName string) *monitoringv1.ServiceMonitor { + serviceMonitor := &monitoringv1.ServiceMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceMonitorName, + Namespace: deploymentNamespace, + Labels: map[string]string{ + "modelmesh-service": "modelmesh-serving", + }, + }, + Spec: monitoringv1.ServiceMonitorSpec{ + NamespaceSelector: monitoringv1.NamespaceSelector{ + MatchNames: []string{monitoredNamespace}, + }, + Endpoints: []monitoringv1.Endpoint{ + { + Interval: "4s", + Path: "/q/metrics", + HonorLabels: true, + Scheme: "http", + Params: map[string][]string{ + "match[]": { + `{__name__= "trustyai_spd"}`, + `{__name__= "trustyai_dir"}`, + }, + }, + MetricRelabelConfigs: []*monitoringv1.RelabelConfig{ + { + Action: "keep", + Regex: "trustyai_.*", + SourceLabels: []monitoringv1.LabelName{"__name__"}, + }, + }, + }, + }, + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app.kubernetes.io/name": serviceName, + }, + }, + }, + } + return serviceMonitor +} + +func (r *TrustyAIServiceReconciler) ensureServiceMonitor(cr *trustyaiopendatahubiov1alpha1.TrustyAIService, ctx context.Context, isLocal bool) error { + var deploymentNamespace string + if isLocal { + deploymentNamespace = cr.Namespace + } else { + deploymentNamespace = r.Namespace + } + + serviceMonitor := generateServiceMonitorSpec(deploymentNamespace, cr.Namespace, cr.Name) + + // Set TrustyAIService instance as the owner and controller + err := ctrl.SetControllerReference(cr, serviceMonitor, r.Scheme) + if err != nil { + return err + } + + // Check if this ServiceMonitor already exists + found := &monitoringv1.ServiceMonitor{} + err = r.Get(ctx, types.NamespacedName{Name: serviceMonitor.Name, Namespace: serviceMonitor.Namespace}, found) + if err != nil { + if errors.IsNotFound(err) { + var logMessage string + if isLocal { + logMessage = "Creating a new local ServiceMonitor" + } else { + logMessage = "Creating a new central ServiceMonitor" + } + + log.FromContext(ctx).Info(logMessage, "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + err = r.Create(ctx, serviceMonitor) + if err != nil { + log.FromContext(ctx).Error(err, "Failed to create ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + return err + } + } else { + log.FromContext(ctx).Error(err, "Couldn't create new ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + return err + } + } + + return nil +} + +func (r *TrustyAIServiceReconciler) ensureLocalServiceMonitor(cr *trustyaiopendatahubiov1alpha1.TrustyAIService, ctx context.Context) error { + return r.ensureServiceMonitor(cr, ctx, true) +} + +func (r *TrustyAIServiceReconciler) ensureCentralServiceMonitor(cr *trustyaiopendatahubiov1alpha1.TrustyAIService, ctx context.Context) error { + return r.ensureServiceMonitor(cr, ctx, false) +} diff --git a/controllers/trustyaiservice_controller.go b/controllers/trustyaiservice_controller.go index 5db1069..e1d18a5 100644 --- a/controllers/trustyaiservice_controller.go +++ b/controllers/trustyaiservice_controller.go @@ -21,7 +21,6 @@ import ( goerrors "errors" "fmt" kserveapi "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" - monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -226,8 +225,14 @@ func (r *TrustyAIServiceReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, err } - // Service Monitor - err = r.reconcileServiceMonitor(instance, ctx) + // Local Service Monitor + err = r.ensureLocalServiceMonitor(instance, ctx) + if err != nil { + return ctrl.Result{}, err + } + + // Central Service Monitor + err = r.ensureCentralServiceMonitor(instance, ctx) if err != nil { return ctrl.Result{}, err } @@ -285,75 +290,6 @@ func (r *TrustyAIServiceReconciler) reconcileService(cr *trustyaiopendatahubiov1 return service, nil } -func (r *TrustyAIServiceReconciler) reconcileServiceMonitor(cr *trustyaiopendatahubiov1alpha1.TrustyAIService, ctx context.Context) error { - - serviceMonitor := &monitoringv1.ServiceMonitor{ - ObjectMeta: metav1.ObjectMeta{ - Name: serviceMonitorName, - Namespace: cr.Namespace, - Labels: map[string]string{ - "modelmesh-service": "modelmesh-serving", - }, - }, - Spec: monitoringv1.ServiceMonitorSpec{ - NamespaceSelector: monitoringv1.NamespaceSelector{ - MatchNames: []string{cr.Namespace}, - }, - Endpoints: []monitoringv1.Endpoint{ - { - Interval: "4s", - Path: "/q/metrics", - HonorLabels: true, - Scheme: "http", - Params: map[string][]string{ - "match[]": { - `{__name__= "trustyai_spd"}`, - `{__name__= "trustyai_dir"}`, - }, - }, - MetricRelabelConfigs: []*monitoringv1.RelabelConfig{ - { - Action: "keep", - Regex: "trustyai_.*", - SourceLabels: []monitoringv1.LabelName{"__name__"}, - }, - }, - }, - }, - Selector: metav1.LabelSelector{ - MatchLabels: map[string]string{ - "app.kubernetes.io/name": cr.Name, - }, - }, - }, - } - - // Set TrustyAIService instance as the owner and controller - err := ctrl.SetControllerReference(cr, serviceMonitor, r.Scheme) - if err != nil { - return err - } - - // Check if this ServiceMonitor already exists - found := &monitoringv1.ServiceMonitor{} - err = r.Get(ctx, types.NamespacedName{Name: serviceMonitor.Name, Namespace: serviceMonitor.Namespace}, found) - if err != nil { - if errors.IsNotFound(err) { - log.FromContext(ctx).Info("Creating a new ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) - err = r.Create(ctx, serviceMonitor) - if err != nil { - log.FromContext(ctx).Error(err, "Not found ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) - return err - } - } else { - log.FromContext(ctx).Error(err, "Couldn't create new ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) - return err - } - } - - return nil -} - // SetupWithManager sets up the controller with the Manager. func (r *TrustyAIServiceReconciler) SetupWithManager(mgr ctrl.Manager) error { // Watch ServingRuntime objects (not managed by this controller) From 0e3dc20dc2123263810bc06f40e273ffea5978aa Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Fri, 14 Jul 2023 15:53:46 +0100 Subject: [PATCH 2/4] Change match labels and namespaces watched by ServiceMonitors --- controllers/monitor.go | 118 +++++++++++++++++----- controllers/trustyaiservice_controller.go | 5 +- 2 files changed, 93 insertions(+), 30 deletions(-) diff --git a/controllers/monitor.go b/controllers/monitor.go index 7fac57f..cc8bc09 100644 --- a/controllers/monitor.go +++ b/controllers/monitor.go @@ -11,7 +11,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" ) -func generateServiceMonitorSpec(deploymentNamespace string, monitoredNamespace string, serviceName string) *monitoringv1.ServiceMonitor { +var isCentralServiceMonitorCreated = false + +// generateServiceMonitorSpecCentral generates the ServiceMonitor spec for central ServiceMonitor +func generateServiceMonitorSpecCentral(deploymentNamespace string) *monitoringv1.ServiceMonitor { serviceMonitor := &monitoringv1.ServiceMonitor{ ObjectMeta: metav1.ObjectMeta{ Name: serviceMonitorName, @@ -22,7 +25,7 @@ func generateServiceMonitorSpec(deploymentNamespace string, monitoredNamespace s }, Spec: monitoringv1.ServiceMonitorSpec{ NamespaceSelector: monitoringv1.NamespaceSelector{ - MatchNames: []string{monitoredNamespace}, + Any: true, }, Endpoints: []monitoringv1.Endpoint{ { @@ -47,7 +50,7 @@ func generateServiceMonitorSpec(deploymentNamespace string, monitoredNamespace s }, Selector: metav1.LabelSelector{ MatchLabels: map[string]string{ - "app.kubernetes.io/name": serviceName, + "app.kubernetes.io/part-of": serviceType, }, }, }, @@ -55,15 +58,89 @@ func generateServiceMonitorSpec(deploymentNamespace string, monitoredNamespace s return serviceMonitor } -func (r *TrustyAIServiceReconciler) ensureServiceMonitor(cr *trustyaiopendatahubiov1alpha1.TrustyAIService, ctx context.Context, isLocal bool) error { - var deploymentNamespace string - if isLocal { - deploymentNamespace = cr.Namespace +// ensureCentralServiceMonitor ensures that the central ServiceMonitor is created +func (r *TrustyAIServiceReconciler) ensureCentralServiceMonitor(ctx context.Context) error { + // Create only one central ServiceMonitor + if isCentralServiceMonitorCreated { + return nil + } + + serviceMonitor := generateServiceMonitorSpecCentral(r.Namespace) + + // Check if this ServiceMonitor already exists + found := &monitoringv1.ServiceMonitor{} + err := r.Get(ctx, types.NamespacedName{Name: serviceMonitor.Name, Namespace: serviceMonitor.Namespace}, found) + if err != nil { + if errors.IsNotFound(err) { + log.FromContext(ctx).Info("Creating a new central ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + err = r.Create(ctx, serviceMonitor) + if err != nil { + log.FromContext(ctx).Error(err, "Failed to create central ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + return err + } else { + // Set the global variable to true after ServiceMonitor is successfully created + isCentralServiceMonitorCreated = true + } + } else { + log.FromContext(ctx).Error(err, "Failed to get central ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + return err + } } else { - deploymentNamespace = r.Namespace + // Already exists, set the global variable to true + isCentralServiceMonitorCreated = true } - serviceMonitor := generateServiceMonitorSpec(deploymentNamespace, cr.Namespace, cr.Name) + return nil +} + +// generateServiceMonitorSpecLocal generates the ServiceMonitor spec for a local ServiceMonitor +func generateServiceMonitorSpecLocal(deploymentNamespace string, serviceName string) *monitoringv1.ServiceMonitor { + serviceMonitor := &monitoringv1.ServiceMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceName, + Namespace: deploymentNamespace, + Labels: map[string]string{ + "modelmesh-service": "modelmesh-serving", + }, + }, + Spec: monitoringv1.ServiceMonitorSpec{ + NamespaceSelector: monitoringv1.NamespaceSelector{ + MatchNames: []string{deploymentNamespace}, + }, + Endpoints: []monitoringv1.Endpoint{ + { + Interval: "4s", + Path: "/q/metrics", + HonorLabels: true, + Scheme: "http", + Params: map[string][]string{ + "match[]": { + `{__name__= "trustyai_spd"}`, + `{__name__= "trustyai_dir"}`, + }, + }, + MetricRelabelConfigs: []*monitoringv1.RelabelConfig{ + { + Action: "keep", + Regex: "trustyai_.*", + SourceLabels: []monitoringv1.LabelName{"__name__"}, + }, + }, + }, + }, + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app.kubernetes.io/part-of": serviceType, + }, + }, + }, + } + return serviceMonitor +} + +// ensureLocalServiceMonitor ensures that the local ServiceMonitor is created +func (r *TrustyAIServiceReconciler) ensureLocalServiceMonitor(cr *trustyaiopendatahubiov1alpha1.TrustyAIService, ctx context.Context) error { + serviceMonitor := generateServiceMonitorSpecLocal(cr.Namespace, cr.Name) // Set TrustyAIService instance as the owner and controller err := ctrl.SetControllerReference(cr, serviceMonitor, r.Scheme) @@ -71,37 +148,22 @@ func (r *TrustyAIServiceReconciler) ensureServiceMonitor(cr *trustyaiopendatahub return err } - // Check if this ServiceMonitor already exists + // Check if the ServiceMonitor already exists found := &monitoringv1.ServiceMonitor{} err = r.Get(ctx, types.NamespacedName{Name: serviceMonitor.Name, Namespace: serviceMonitor.Namespace}, found) if err != nil { if errors.IsNotFound(err) { - var logMessage string - if isLocal { - logMessage = "Creating a new local ServiceMonitor" - } else { - logMessage = "Creating a new central ServiceMonitor" - } - - log.FromContext(ctx).Info(logMessage, "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + log.FromContext(ctx).Info("Creating a new local ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) err = r.Create(ctx, serviceMonitor) if err != nil { - log.FromContext(ctx).Error(err, "Failed to create ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + log.FromContext(ctx).Error(err, "Failed to create local ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) return err } } else { - log.FromContext(ctx).Error(err, "Couldn't create new ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + log.FromContext(ctx).Error(err, "Failed to get local ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) return err } } return nil } - -func (r *TrustyAIServiceReconciler) ensureLocalServiceMonitor(cr *trustyaiopendatahubiov1alpha1.TrustyAIService, ctx context.Context) error { - return r.ensureServiceMonitor(cr, ctx, true) -} - -func (r *TrustyAIServiceReconciler) ensureCentralServiceMonitor(cr *trustyaiopendatahubiov1alpha1.TrustyAIService, ctx context.Context) error { - return r.ensureServiceMonitor(cr, ctx, false) -} diff --git a/controllers/trustyaiservice_controller.go b/controllers/trustyaiservice_controller.go index e1d18a5..018300f 100644 --- a/controllers/trustyaiservice_controller.go +++ b/controllers/trustyaiservice_controller.go @@ -48,6 +48,7 @@ const ( modelMeshLabelKey = "modelmesh-service" modelMeshLabelValue = "modelmesh-serving" volumeMountName = "volume" + serviceType = "trustyai-service" ) // TrustyAIServiceReconciler reconciles a TrustyAIService object @@ -79,7 +80,7 @@ func getCommonLabels(serviceName string) map[string]string { "app": serviceName, "app.kubernetes.io/name": serviceName, "app.kubernetes.io/instance": serviceName, - "app.kubernetes.io/part-of": serviceName, + "app.kubernetes.io/part-of": serviceType, "app.kubernetes.io/version": "0.1.0", } } @@ -232,7 +233,7 @@ func (r *TrustyAIServiceReconciler) Reconcile(ctx context.Context, req ctrl.Requ } // Central Service Monitor - err = r.ensureCentralServiceMonitor(instance, ctx) + err = r.ensureCentralServiceMonitor(ctx) if err != nil { return ctrl.Result{}, err } From b4aaf5cd717b06cc15708e5ba521c27863f1a2fa Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Fri, 14 Jul 2023 16:16:36 +0100 Subject: [PATCH 3/4] Fix tests --- controllers/suite_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/controllers/suite_test.go b/controllers/suite_test.go index e8c997c..89e0990 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -243,7 +243,7 @@ var _ = Describe("TrustyAI operator", func() { Expect(deployment.Labels["app"]).Should(Equal(name)) Expect(deployment.Labels["app.kubernetes.io/name"]).Should(Equal(name)) Expect(deployment.Labels["app.kubernetes.io/instance"]).Should(Equal(name)) - Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(name)) + Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(serviceType)) Expect(deployment.Labels["app.kubernetes.io/version"]).Should(Equal("0.1.0")) Expect(deployment.Spec.Template.Spec.Containers[0].Image).Should(Equal("quay.io/trustyai/trustyai-service:latest")) @@ -298,7 +298,7 @@ var _ = Describe("TrustyAI operator", func() { Expect(deployment.Labels["app"]).Should(Equal(name)) Expect(deployment.Labels["app.kubernetes.io/name"]).Should(Equal(name)) Expect(deployment.Labels["app.kubernetes.io/instance"]).Should(Equal(name)) - Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(name)) + Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(serviceType)) Expect(deployment.Labels["app.kubernetes.io/version"]).Should(Equal("0.1.0")) Expect(deployment.Spec.Template.Spec.Containers[0].Image).Should(Equal("quay.io/trustyai/trustyai-service:latest")) From 44d818b9115fabd4f3e65f243ec07b8ea2a39c4e Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Mon, 17 Jul 2023 10:34:19 +0100 Subject: [PATCH 4/4] Get central ServiceMonitor and create/update as needed Remove the boolean flag --- controllers/monitor.go | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/controllers/monitor.go b/controllers/monitor.go index cc8bc09..cb25bbd 100644 --- a/controllers/monitor.go +++ b/controllers/monitor.go @@ -11,8 +11,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" ) -var isCentralServiceMonitorCreated = false - // generateServiceMonitorSpecCentral generates the ServiceMonitor spec for central ServiceMonitor func generateServiceMonitorSpecCentral(deploymentNamespace string) *monitoringv1.ServiceMonitor { serviceMonitor := &monitoringv1.ServiceMonitor{ @@ -60,11 +58,6 @@ func generateServiceMonitorSpecCentral(deploymentNamespace string) *monitoringv1 // ensureCentralServiceMonitor ensures that the central ServiceMonitor is created func (r *TrustyAIServiceReconciler) ensureCentralServiceMonitor(ctx context.Context) error { - // Create only one central ServiceMonitor - if isCentralServiceMonitorCreated { - return nil - } - serviceMonitor := generateServiceMonitorSpecCentral(r.Namespace) // Check if this ServiceMonitor already exists @@ -77,17 +70,11 @@ func (r *TrustyAIServiceReconciler) ensureCentralServiceMonitor(ctx context.Cont if err != nil { log.FromContext(ctx).Error(err, "Failed to create central ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) return err - } else { - // Set the global variable to true after ServiceMonitor is successfully created - isCentralServiceMonitorCreated = true } } else { log.FromContext(ctx).Error(err, "Failed to get central ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) return err } - } else { - // Already exists, set the global variable to true - isCentralServiceMonitorCreated = true } return nil