From c7e901fd2a6f9c329f5a8d7c434d7ca7b9bf17a7 Mon Sep 17 00:00:00 2001 From: Jorge Turrado Ferrero Date: Mon, 6 May 2024 17:09:27 +0200 Subject: [PATCH] fix: Prometheus metrics expose correct info (#5779) Signed-off-by: Jorge Turrado --- pkg/metricscollector/opentelemetry.go | 4 +- pkg/metricscollector/prommetrics.go | 2 +- .../azureeventgridtopic_test.go | 0 .../opentelemetry_metrics_test.go | 105 +++++++++- .../prometheus_metrics_test.go | 191 +++++++++++++++++- 5 files changed, 288 insertions(+), 14 deletions(-) rename tests/{ => internals}/eventemitter/azureeventgridtopic/azureeventgridtopic_test.go (100%) diff --git a/pkg/metricscollector/opentelemetry.go b/pkg/metricscollector/opentelemetry.go index f64b05d6279..db64edde67d 100644 --- a/pkg/metricscollector/opentelemetry.go +++ b/pkg/metricscollector/opentelemetry.go @@ -146,7 +146,7 @@ func initMeters() { _, err = meter.Float64ObservableGauge( "keda.scaler.metrics.latency", - api.WithDescription("DEPRECATED - use `keda_scaler_metrics_latency_seconds` instead"), + api.WithDescription("DEPRECATED - use `keda.scaler.metrics.latency.seconds` instead"), api.WithFloat64Callback(ScalerMetricsLatencyCallbackDeprecated), ) if err != nil { @@ -164,7 +164,7 @@ func initMeters() { _, err = meter.Float64ObservableGauge( "keda.internal.scale.loop.latency", - api.WithDescription("DEPRECATED - use `keda_internal_scale_loop_latency_seconds` instead"), + api.WithDescription("DEPRECATED - use `keda.internal.scale.loop.latency.seconds` instead"), api.WithFloat64Callback(ScalableObjectLatencyCallbackDeprecated), ) if err != nil { diff --git a/pkg/metricscollector/prommetrics.go b/pkg/metricscollector/prommetrics.go index 2cf51f58e14..a5b9bc6c5b8 100644 --- a/pkg/metricscollector/prommetrics.go +++ b/pkg/metricscollector/prommetrics.go @@ -178,7 +178,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "resource", Name: "totals", - Help: fmt.Sprintf("%v use 'keda_resource_handled_total' instead.", bestPracticeDeprecatedMsg), + Help: fmt.Sprintf("%v use 'keda_resource_registered_total' instead.", bestPracticeDeprecatedMsg), }, []string{"type", "namespace"}, ) diff --git a/tests/eventemitter/azureeventgridtopic/azureeventgridtopic_test.go b/tests/internals/eventemitter/azureeventgridtopic/azureeventgridtopic_test.go similarity index 100% rename from tests/eventemitter/azureeventgridtopic/azureeventgridtopic_test.go rename to tests/internals/eventemitter/azureeventgridtopic/azureeventgridtopic_test.go diff --git a/tests/sequential/opentelemetry_metrics/opentelemetry_metrics_test.go b/tests/sequential/opentelemetry_metrics/opentelemetry_metrics_test.go index 06fab9b1fba..be9fafdc46e 100644 --- a/tests/sequential/opentelemetry_metrics/opentelemetry_metrics_test.go +++ b/tests/sequential/opentelemetry_metrics/opentelemetry_metrics_test.go @@ -7,6 +7,7 @@ import ( "bytes" "context" "fmt" + "maps" "os/exec" "strings" "testing" @@ -818,6 +819,24 @@ func testScalerMetricLatency(t *testing.T) { } assert.Equal(t, true, found) } + val, ok = family["keda_scaler_metrics_latency_seconds"] + assert.True(t, ok, "keda_scaler_metrics_latency_seconds not available") + if ok { + var found bool + metrics := val.GetMetric() + for _, metric := range metrics { + t.Log("--- latency metric detail info ---", "metric", metric) + labels := metric.GetLabel() + for _, label := range labels { + if (*label.Name == labelScaledObject && *label.Value == scaledObjectName) || + (*label.Name == labelScaledJob && *label.Value == scaledJobName) { + assert.InDelta(t, float64(0), *metric.Gauge.Value, 0.001) + found = true + } + } + } + assert.Equal(t, true, found) + } } func testScalableObjectMetrics(t *testing.T) { @@ -855,6 +874,37 @@ func testScalableObjectMetrics(t *testing.T) { } assert.Equal(t, true, found) } + + val, ok = family["keda_internal_scale_loop_latency_seconds"] + assert.True(t, ok, "keda_internal_scale_loop_latency_seconds not available") + if ok { + var found bool + metrics := val.GetMetric() + + // check scaledobject loop + found = false + for _, metric := range metrics { + labels := metric.GetLabel() + for _, label := range labels { + if *label.Name == labelType && *label.Value == "scaledobject" { + found = true + } + } + } + assert.Equal(t, true, found) + + // check scaledjob loop + found = false + for _, metric := range metrics { + labels := metric.GetLabel() + for _, label := range labels { + if *label.Name == labelType && *label.Value == "scaledjob" { + found = true + } + } + } + assert.Equal(t, true, found) + } } func testScalerActiveMetric(t *testing.T, kc *kubernetes.Clientset) { @@ -1013,15 +1063,16 @@ func getLatestCommit(t *testing.T) string { return strings.Trim(out.String(), "\n") } -func checkTriggerTotalValues(t *testing.T, families map[string]*prommodel.MetricFamily, expected map[string]int) { +func checkTriggerTotalValues(t *testing.T, families map[string]*prommodel.MetricFamily, expectedValues map[string]int) { t.Log("--- testing trigger total metrics ---") + expected := map[string]int{} family, ok := families["keda_trigger_totals"] assert.True(t, ok, "keda_trigger_totals not available") if !ok { return } - + maps.Copy(expected, expectedValues) metrics := family.GetMetric() for _, metric := range metrics { labels := metric.GetLabel() @@ -1040,6 +1091,31 @@ func checkTriggerTotalValues(t *testing.T, families map[string]*prommodel.Metric } assert.Equal(t, 0, len(expected)) + + family, ok = families["keda_trigger_registered_count"] + assert.True(t, ok, "keda_trigger_registered_count not available") + if !ok { + return + } + maps.Copy(expected, expectedValues) + metrics = family.GetMetric() + for _, metric := range metrics { + labels := metric.GetLabel() + for _, label := range labels { + if *label.Name == labelType { + triggerType := *label.Value + metricValue := *metric.Gauge.Value + expectedMetricValue := float64(expected[triggerType]) + + assert.Equalf(t, expectedMetricValue, metricValue, "expected %f got %f for trigger type %s", + expectedMetricValue, metricValue, triggerType) + + delete(expected, triggerType) + } + } + } + + assert.Equal(t, 0, len(expected)) } func checkCRTotalValues(t *testing.T, families map[string]*prommodel.MetricFamily, expected map[string]map[string]int) { @@ -1069,6 +1145,31 @@ func checkCRTotalValues(t *testing.T, families map[string]*prommodel.MetricFamil assert.Equalf(t, expectedMetricValue, metricValue, "expected %f got %f for cr type %s & namespace %s", expectedMetricValue, metricValue, crType, namespace) } + + family, ok = families["keda_resource_registered_count"] + assert.True(t, ok, "keda_resource_registered_count not available") + if !ok { + return + } + + metrics = family.GetMetric() + for _, metric := range metrics { + labels := metric.GetLabel() + var namespace, crType string + for _, label := range labels { + if *label.Name == labelType { + crType = *label.Value + } else if *label.Name == namespaceString { + namespace = *label.Value + } + } + + metricValue := *metric.Gauge.Value + expectedMetricValue := float64(expected[crType][namespace]) + + assert.Equalf(t, expectedMetricValue, metricValue, "expected %f got %f for cr type %s & namespace %s", + expectedMetricValue, metricValue, crType, namespace) + } } func assertScaledObjectFlagMetric(t *testing.T, families map[string]*prommodel.MetricFamily, scaledObjectName string, metricName string, expected bool) { diff --git a/tests/sequential/prometheus_metrics/prometheus_metrics_test.go b/tests/sequential/prometheus_metrics/prometheus_metrics_test.go index fe55813f701..9593d13d5cf 100644 --- a/tests/sequential/prometheus_metrics/prometheus_metrics_test.go +++ b/tests/sequential/prometheus_metrics/prometheus_metrics_test.go @@ -7,6 +7,7 @@ import ( "bytes" "context" "fmt" + "maps" "os/exec" "strings" "testing" @@ -535,8 +536,11 @@ func testScaledObjectErrors(t *testing.T, data templateData) { family := fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) val, ok := family["keda_scaled_object_errors"] assert.True(t, ok, "keda_scaled_object_errors not available") - if ok { + valTotal, okTotal := family["keda_scaled_object_errors_total"] + assert.True(t, okTotal, "keda_scaled_object_errors_total not available") + if ok && okTotal { errCounterVal1 := getErrorMetricsValue(val) + errCounterValTotal1 := getErrorMetricsValue(valTotal) // wait for 2 seconds as pollinginterval is 2 time.Sleep(2 * time.Second) @@ -544,10 +548,15 @@ func testScaledObjectErrors(t *testing.T, data templateData) { family = fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) val, ok := family["keda_scaled_object_errors"] assert.True(t, ok, "keda_scaled_object_errors not available") - if ok { + valTotal, okTotal := family["keda_scaled_object_errors_total"] + assert.True(t, okTotal, "keda_scaled_object_errors_total not available") + if ok && okTotal { errCounterVal2 := getErrorMetricsValue(val) + errCounterValTotal2 := getErrorMetricsValue(valTotal) assert.NotEqual(t, errCounterVal2, float64(0)) + assert.NotEqual(t, errCounterValTotal2, float64(0)) assert.GreaterOrEqual(t, errCounterVal2, errCounterVal1) + assert.GreaterOrEqual(t, errCounterValTotal2, errCounterValTotal1) } } @@ -567,17 +576,29 @@ func testScaledJobErrors(t *testing.T, data templateData) { time.Sleep(20 * time.Second) family := fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) - if val, ok := family["keda_scaled_job_errors"]; ok { + val, ok := family["keda_scaled_job_errors"] + assert.True(t, ok, "keda_scaled_job_errors not available") + valTotal, okTotal := family["keda_scaled_job_errors_total"] + assert.True(t, okTotal, "keda_scaled_job_errors_total not available") + if ok && okTotal { errCounterVal1 := getErrorMetricsValue(val) + errCounterValTotal1 := getErrorMetricsValue(valTotal) // wait for 2 seconds as pollinginterval is 2 time.Sleep(2 * time.Second) family = fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) - if val, ok := family["keda_scaled_job_errors"]; ok { + val, ok := family["keda_scaled_job_errors"] + assert.True(t, ok, "keda_scaled_job_errors not available") + valTotal, okTotal := family["keda_scaled_job_errors_total"] + assert.True(t, okTotal, "keda_scaled_job_errors_total not available") + if ok && okTotal { errCounterVal2 := getErrorMetricsValue(val) + errCounterValTotal2 := getErrorMetricsValue(valTotal) assert.NotEqual(t, errCounterVal2, float64(0)) + assert.NotEqual(t, errCounterValTotal2, float64(0)) assert.GreaterOrEqual(t, errCounterVal2, errCounterVal1) + assert.GreaterOrEqual(t, errCounterValTotal2, errCounterValTotal1) } else { t.Errorf("metric not available") } @@ -637,8 +658,11 @@ func testScalerErrorsTotal(t *testing.T, data templateData) { family := fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) val, ok := family["keda_scaler_errors_total"] assert.True(t, ok, "keda_scaler_errors_total not available") - if ok { + valDetail, okDetail := family["keda_scaler_detail_errors_total"] + assert.True(t, okDetail, "keda_scaler_detail_errors_total not available") + if ok && okDetail { errCounterVal1 := getErrorMetricsValue(val) + errCounterValDetail1 := getErrorMetricsValue(valDetail) // wait for 2 seconds as pollinginterval is 2 time.Sleep(2 * time.Second) @@ -646,10 +670,15 @@ func testScalerErrorsTotal(t *testing.T, data templateData) { family = fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) val, ok := family["keda_scaler_errors_total"] assert.True(t, ok, "keda_scaler_errors_total not available") - if ok { + valDetail, okDetail := family["keda_scaler_detail_errors_total"] + assert.True(t, okDetail, "keda_scaler_detail_errors_total not available") + if ok && okDetail { errCounterVal2 := getErrorMetricsValue(val) + errCounterValDetail2 := getErrorMetricsValue(valDetail) assert.NotEqual(t, errCounterVal2, float64(0)) + assert.NotEqual(t, errCounterValDetail2, float64(0)) assert.GreaterOrEqual(t, errCounterVal2, errCounterVal1) + assert.GreaterOrEqual(t, errCounterValDetail2, errCounterValDetail1) } } @@ -665,6 +694,13 @@ func getErrorMetricsValue(val *prommodel.MetricFamily) float64 { for _, metric := range metrics { return metric.GetCounter().GetValue() } + case "keda_scaler_detail_errors_total": + metrics := val.GetMetric() + result := 0. + for _, metric := range metrics { + result += metric.GetCounter().GetValue() + } + return result case "keda_scaled_object_errors": metrics := val.GetMetric() for _, metric := range metrics { @@ -675,6 +711,16 @@ func getErrorMetricsValue(val *prommodel.MetricFamily) float64 { } } } + case "keda_scaled_object_errors_total": + metrics := val.GetMetric() + for _, metric := range metrics { + labels := metric.GetLabel() + for _, label := range labels { + if *label.Name == "scaledObject" && *label.Value == wrongScaledObjectName { + return *metric.Counter.Value + } + } + } case "keda_scaled_job_errors": metrics := val.GetMetric() for _, metric := range metrics { @@ -685,6 +731,16 @@ func getErrorMetricsValue(val *prommodel.MetricFamily) float64 { } } } + case "keda_scaled_job_errors_total": + metrics := val.GetMetric() + for _, metric := range metrics { + labels := metric.GetLabel() + for _, label := range labels { + if *label.Name == "scaledJob" && *label.Value == wrongScaledJobName { + return *metric.Counter.Value + } + } + } case "keda_scaler_errors": metrics := val.GetMetric() for _, metric := range metrics { @@ -746,6 +802,24 @@ func testScalerMetricLatency(t *testing.T) { } assert.Equal(t, true, found) } + + val, ok = family["keda_scaler_metrics_latency_seconds"] + assert.True(t, ok, "keda_scaler_metrics_latency_seconds not available") + if ok { + var found bool + metrics := val.GetMetric() + for _, metric := range metrics { + labels := metric.GetLabel() + for _, label := range labels { + if (*label.Name == labelScaledObject && *label.Value == scaledObjectName) || + (*label.Name == labelScaledJob && *label.Value == scaledJobName) { + assert.InDelta(t, float64(0), *metric.Gauge.Value, 0.001) + found = true + } + } + } + assert.Equal(t, true, found) + } } func testScalableObjectMetrics(t *testing.T) { @@ -769,6 +843,36 @@ func testScalableObjectMetrics(t *testing.T) { } assert.Equal(t, true, found) + // check scaledjob loop + found = false + for _, metric := range metrics { + labels := metric.GetLabel() + for _, label := range labels { + if *label.Name == labelType && *label.Value == "scaledjob" { + found = true + } + } + } + assert.Equal(t, true, found) + } else { + t.Errorf("scaledobject metric not available") + } + if val, ok := family["keda_internal_scale_loop_latency_seconds"]; ok { + var found bool + metrics := val.GetMetric() + + // check scaledobject loop + found = false + for _, metric := range metrics { + labels := metric.GetLabel() + for _, label := range labels { + if *label.Name == labelType && *label.Value == "scaledobject" { + found = true + } + } + } + assert.Equal(t, true, found) + // check scaledjob loop found = false for _, metric := range metrics { @@ -968,15 +1072,15 @@ func getLatestCommit(t *testing.T) string { return strings.Trim(out.String(), "\n") } -func checkTriggerTotalValues(t *testing.T, families map[string]*prommodel.MetricFamily, expected map[string]int) { +func checkTriggerTotalValues(t *testing.T, families map[string]*prommodel.MetricFamily, expectedValues map[string]int) { t.Log("--- testing trigger total metrics ---") - + expected := map[string]int{} family, ok := families["keda_trigger_totals"] assert.True(t, ok, "keda_trigger_totals not available") if !ok { return } - + maps.Copy(expected, expectedValues) metrics := family.GetMetric() for _, metric := range metrics { labels := metric.GetLabel() @@ -995,6 +1099,31 @@ func checkTriggerTotalValues(t *testing.T, families map[string]*prommodel.Metric } assert.Equal(t, 0, len(expected)) + + family, ok = families["keda_trigger_registered_total"] + assert.True(t, ok, "keda_trigger_registered_total not available") + if !ok { + return + } + maps.Copy(expected, expectedValues) + metrics = family.GetMetric() + for _, metric := range metrics { + labels := metric.GetLabel() + for _, label := range labels { + if *label.Name == labelType { + triggerType := *label.Value + metricValue := *metric.Gauge.Value + expectedMetricValue := float64(expected[triggerType]) + + assert.Equalf(t, expectedMetricValue, metricValue, "expected %f got %f for trigger type %s", + expectedMetricValue, metricValue, triggerType) + + delete(expected, triggerType) + } + } + } + + assert.Equal(t, 0, len(expected)) } func checkCRTotalValues(t *testing.T, families map[string]*prommodel.MetricFamily, expected map[string]map[string]int) { @@ -1024,6 +1153,31 @@ func checkCRTotalValues(t *testing.T, families map[string]*prommodel.MetricFamil assert.Equalf(t, expectedMetricValue, metricValue, "expected %f got %f for cr type %s & namespace %s", expectedMetricValue, metricValue, crType, namespace) } + + family, ok = families["keda_resource_registered_total"] + assert.True(t, ok, "keda_resource_registered_total not available") + if !ok { + return + } + + metrics = family.GetMetric() + for _, metric := range metrics { + labels := metric.GetLabel() + var namespace, crType string + for _, label := range labels { + if *label.Name == labelType { + crType = *label.Value + } else if *label.Name == namespaceString { + namespace = *label.Value + } + } + + metricValue := *metric.Gauge.Value + expectedMetricValue := float64(expected[crType][namespace]) + + assert.Equalf(t, expectedMetricValue, metricValue, "expected %f got %f for cr type %s & namespace %s", + expectedMetricValue, metricValue, crType, namespace) + } } func checkGRPCServerMetrics(t *testing.T, families map[string]*prommodel.MetricFamily) { @@ -1104,6 +1258,25 @@ func checkGRPCServerMetrics(t *testing.T, families map[string]*prommodel.MetricF metricValue += *metric.Counter.Value } assert.GreaterOrEqual(t, metricValue, 1.0, "keda_internal_metricsservice_grpc_server_msg_sent_total has to be greater than 0") + + family, ok = families["keda_internal_metricsservice_grpc_server_handling_seconds"] + if !ok { + t.Errorf("metric keda_internal_metricsservice_grpc_server_handling_seconds not available") + return + } + + metricValue = 0.0 + metrics = family.GetMetric() + for _, metric := range metrics { + labels := metric.GetLabel() + for _, label := range labels { + if *label.Name == namespaceString && *label.Value != testNamespace { + continue + } + } + metricValue += *metric.Histogram.SampleSum + } + assert.Greater(t, metricValue, 0.0, "keda_internal_metricsservice_grpc_server_handling_seconds has to be greater than 0") } func checkGRPCClientMetrics(t *testing.T, families map[string]*prommodel.MetricFamily) {