Skip to content

Commit

Permalink
Add support for OCP 4.16
Browse files Browse the repository at this point in the history
  • Loading branch information
Debakel Orakel committed Oct 15, 2024
1 parent 4d39bbd commit a9bbcbe
Show file tree
Hide file tree
Showing 28 changed files with 3,358 additions and 109 deletions.
4 changes: 2 additions & 2 deletions class/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ parameters:
manifests_version: release-4.15
# operator-lifecycle-manager has no release branches newer than 4.9, so every supported OCP version maps to release-4.9
=_operator_lifecycle_manager_map:
release-4.13: release-4.9
release-4.14: release-4.9
release-4.15: release-4.9
release-4.16: release-4.9
jsonnetfile_parameters:
cmo_version: ${openshift4_monitoring:manifests_version}
etcd_version: ${openshift4_monitoring:manifests_version}
Expand Down Expand Up @@ -205,9 +205,9 @@ parameters:
NodeMemoryMajorPagesFaults:
# Only alert for >100*cores major page faults/node instead of >500/node
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > on (instance) (count by (instance) (node_cpu_info{}) * 100)
release-4.13: {}
release-4.14: {}
release-4.15: {}
release-4.16: {}
# Alerts to ignore for user workload monitoring
ignoreUserWorkload: []

Expand Down
34 changes: 21 additions & 13 deletions class/openshift4-monitoring.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@ parameters:
openshift4_monitoring:
=_manifest_urls:
kube-apiserver:
release-4.13:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/kube-apiserver-slos-basic.yaml
release-4.14:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/cpu-utilization.yaml
Expand All @@ -14,21 +10,28 @@ parameters:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/kube-apiserver-slos-basic.yaml
release-4.16:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.16/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.16/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.16/bindata/assets/alerts/kube-apiserver-slos-basic.yaml

machine-api-operator:
release-4.13:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.13/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.14:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.14/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.15:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.15/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.16:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.16/install/0000_90_machine-api-operator_04_alertrules.yaml

machine-config-operator:
release-4.14:
prometheus: https://raw.githubusercontent.com/openshift/machine-config-operator/release-4.14/install/0000_90_machine-config-operator_01_prometheus-rules.yaml
release-4.15:
prometheus: https://raw.githubusercontent.com/openshift/machine-config-operator/release-4.15/install/0000_90_machine-config-operator_01_prometheus-rules.yaml
release-4.16:
prometheus: https://raw.githubusercontent.com/openshift/machine-config-operator/release-4.16/install/0000_90_machine-config_01_prometheus-rules.yaml

ovn-kubernetes:
release-4.13:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We use the "self-hosted" variant of the control-plane alerts, so
# we don't have to worry about unresolved gotemplate references.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml
release-4.14:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We handle the gotemplate stuff in Jsonnet for now, since Jinja
Expand All @@ -47,11 +50,16 @@ parameters:
# The only templates that are in the alerting rules can be handled
# with a simple string replace.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/release-4.15/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml
release-4.16:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We use the "self-hosted" variant of the control-plane alerts, so
# we don't have to worry about unresolved gotemplate references.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml

cloud-credential-operator:
release-4.13: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.13/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml
release-4.14: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.14/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml
release-4.15: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.15/manifests/0000_90_cloud-credential-operator_03_alertrules.yaml
release-4.16: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.16/manifests/0000_90_cloud-credential-operator_03_alertrules.yaml


kapitan:
Expand Down Expand Up @@ -103,7 +111,7 @@ parameters:
source: ${openshift4_monitoring:_manifest_urls:machine-api-operator:${openshift4_monitoring:manifests_version}:prometheus}
output_path: dependencies/openshift4-monitoring/manifests/${openshift4_monitoring:manifests_version}/machine-api-operator.yaml
- type: https
source: https://raw.githubusercontent.com/openshift/machine-config-operator/${openshift4_monitoring:manifests_version}/install/0000_90_machine-config-operator_01_prometheus-rules.yaml
source: ${openshift4_monitoring:_manifest_urls:machine-config-operator:${openshift4_monitoring:manifests_version}:prometheus}
output_path: dependencies/openshift4-monitoring/manifests/${openshift4_monitoring:manifests_version}/machine-config-operator.yaml
- type: https
source: https://raw.githubusercontent.com/operator-framework/operator-lifecycle-manager/${openshift4_monitoring:_operator_lifecycle_manager_map:${openshift4_monitoring:manifests_version}}/manifests/0000_90_olm_01-prometheus-rule.yaml
Expand Down
4 changes: 2 additions & 2 deletions docs/modules/ROOT/pages/references/parameters.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ The parent key for all of the following parameters is `openshift4_monitoring`.

[horizontal]
type:: string
default:: `release-4.14`
default:: `release-4.15`

Select which version of the upstream alerting (and recording) rules should be used by the component.
This parameter must be changed to match the cluster's OCP4 minor version.
Expand Down Expand Up @@ -480,7 +480,7 @@ patchRules:
PrometheusRemoteWriteBehind:
annotations:
runbook_url: https://example.com/runbooks/PrometheusRemoteWriteBehind.html
release-4.14:
release-4.16:
SystemMemoryExceedsReservation:
for: 30m
----
Expand Down
4 changes: 2 additions & 2 deletions tests/custom-rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ parameters:
name: patch-sa

openshift4_monitoring:
manifests_version: release-4.13
manifests_version: release-4.16

customNodeExporter:
enabled: true
Expand All @@ -25,7 +25,7 @@ parameters:
labels:
foo: foo
generic: patch
release-4.13:
release-4.16:
HighOverallControlPlaneMemory:
labels:
foo: bar
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -852,7 +852,7 @@ spec:
syn_component: openshift4-monitoring
expr: |
(
max by(namespace, statefulset) (
max by(namespace, statefulset, job, cluster) (
kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}
Expand Down Expand Up @@ -1040,7 +1040,7 @@ spec:
}} of its incoming requests.
syn_component: openshift4-monitoring
expr: |
sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
for: 5m
labels:
severity: warning
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -852,7 +852,7 @@ spec:
syn_component: openshift4-monitoring
expr: |
(
max by(namespace, statefulset) (
max by(namespace, statefulset, job, cluster) (
kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}
Expand Down Expand Up @@ -1040,7 +1040,7 @@ spec:
}} of its incoming requests.
syn_component: openshift4-monitoring
expr: |
sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
for: 5m
labels:
severity: warning
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ metadata:
labels:
app.kubernetes.io/part-of: openshift4-monitoring
name: appuio-node-exporter
namespace: openshift-monitoring
rules:
- apiGroups:
- authentication.k8s.io
Expand Down Expand Up @@ -33,7 +32,6 @@ metadata:
labels:
app.kubernetes.io/part-of: openshift4-monitoring
name: appuio-node-exporter
namespace: openshift-monitoring
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
Expand All @@ -59,6 +57,7 @@ spec:
metadata:
annotations:
kubectl.kubernetes.io/default-container: appuio-node-exporter
openshift.io/required-scc: node-exporter
labels:
app.kubernetes.io/managed-by: cluster-monitoring-operator
app.kubernetes.io/part-of: openshift4-monitoring
Expand Down Expand Up @@ -152,6 +151,9 @@ spec:
fi
echo "ts=$(date -Iseconds) num_cpus=$NUM_CPUS gomaxprocs=$GOMAXPROCS"
exec /bin/node_exporter "$0" "$@"
env:
- name: DBUS_SYSTEM_BUS_ADDRESS
value: unix:path=/host/root/var/run/dbus/system_bus_socket
image: quay.io/prometheus/node-exporter:v1.8.2
name: appuio-node-exporter
resources:
Expand All @@ -162,7 +164,6 @@ spec:
cpu: 8m
memory: 32Mi
securityContext: {}
terminationMessagePolicy: FallbackToLogsOnError
volumeMounts:
- mountPath: /host/sys
mountPropagation: HostToContainer
Expand All @@ -177,7 +178,6 @@ spec:
readOnly: true
workingDir: /var/node_exporter/textfile
- args:
- --logtostderr
- --secure-listen-address=[$(IP)]:9199
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:9199/
Expand Down Expand Up @@ -209,7 +209,8 @@ spec:
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
terminationMessagePolicy: FallbackToLogsOnError
seccompProfile:
type: RuntimeDefault
volumeMounts:
- mountPath: /etc/tls/private
name: node-exporter-tls
Expand Down Expand Up @@ -240,7 +241,6 @@ spec:
securityContext:
privileged: true
runAsUser: 0
terminationMessagePolicy: FallbackToLogsOnError
volumeMounts:
- mountPath: /var/node_exporter/textfile
name: node-exporter-textfile
Expand Down Expand Up @@ -306,6 +306,8 @@ apiVersion: v1
kind: Service
metadata:
annotations:
openshift.io/description: Expose the `/metrics` endpoint on port 9199. This port
is for internal use, and no other usage is guaranteed.
service.beta.openshift.io/serving-cert-secret-name: appuio-node-exporter-tls
labels:
app.kubernetes.io/part-of: openshift4-monitoring
Expand Down
Loading

0 comments on commit a9bbcbe

Please sign in to comment.