From a4392a9cdbbd347772d9d4e5d48ebc97ab3e3465 Mon Sep 17 00:00:00 2001 From: Roman Doroschevici Date: Wed, 4 Dec 2024 11:46:36 +0000 Subject: [PATCH] Merge "fix runbook functionality to properly detect pod IP exhaustion and node IP exhaustion" -- Branch commit log -- commit dbfdff3eac39b4f5af58cc47d1c6baa71bd11168 Author: Roman Doroschevici Date: 2024-12-03T12:16:50Z fix runbook functionality to properly detect pod IP exhaustion and node IP exhaustion fix 382018864 Change-Id: I3e7365c4a75998d47c2ddf1a1976ef52d402d23d GitOrigin-RevId: 1e816830e179fb6b2c503bcc038606226b4f4bbe --- gcpdiag/queries/gke.py | 12 ++ gcpdiag/runbook/gke/ip_exhaustion.py | 145 +++++++++++------- .../runbook/gke/snapshots/ip_exhaustion.txt | 7 +- .../runbook/gke/templates/ipexhaustion.jinja | 3 +- .../steps/gke/node-ip-range-exhaustion.md | 3 +- 5 files changed, 111 insertions(+), 59 deletions(-) diff --git a/gcpdiag/queries/gke.py b/gcpdiag/queries/gke.py index e2df65216..3a822dbb3 100644 --- a/gcpdiag/queries/gke.py +++ b/gcpdiag/queries/gke.py @@ -383,6 +383,18 @@ def subnetwork(self) -> Optional[models.Resource]: raise RuntimeError("can't parse network string: %s" % subnetwork_string) return network.get_subnetwork(m.group(1), m.group(2), m.group(3)) + @property + def get_subnet_name(self) -> Optional[models.Resource]: + if 'subnetwork' not in self._resource_data: + return None + return self._resource_data['subnetwork'] + + @property + def get_nodepool_config(self) -> Optional[models.Resource]: + if 'nodePools' not in self._resource_data: + return None + return self._resource_data['nodePools'] + @property def is_private(self) -> bool: if not 'privateClusterConfig' in self._resource_data: diff --git a/gcpdiag/runbook/gke/ip_exhaustion.py b/gcpdiag/runbook/gke/ip_exhaustion.py index cbd6abf0a..a6a943f5c 100644 --- a/gcpdiag/runbook/gke/ip_exhaustion.py +++ b/gcpdiag/runbook/gke/ip_exhaustion.py @@ -117,106 +117,143 @@ def execute(self): op.get(flags.PROJECT_ID))) -class PodIpRangeExhaustion(runbook.Step): - """Check Pod IP Range Exhaustion and offer remediation. +class NodeIpRangeExhaustion(runbook.Step): + """Check Node IP Range Exhaustion and offer remediation. - Checks Pod IP range exhaustion and offers remediation step. + Checks Node IP range exhaustion and offers remediation step. """ - template = 'ipexhaustion::pod_ip_exhaustion' + template = 'ipexhaustion::node_ip_exhaustion' + # max number of characters from the cluster name that will end up in the node name + MAX_GKE_NAME_LENGTH = 16 def execute(self): - """Checking Pod IP Exhaustion and offering remediation steps""" + """Checking node IP Exhaustion and offering remediation steps""" cluster = gke.get_cluster(op.get(flags.PROJECT_ID), cluster_id=op.get(flags.NAME), location=op.get(flags.LOCATION)) - project = op.get(flags.PROJECT_ID) location = op.get(flags.LOCATION) name = op.get(flags.NAME) + error_msg = 'IP_SPACE_EXHAUSTED' + node_subnet = cluster.get_subnet_name + # adding fltr variable, because otherwise the filter string will be longer than 100 chars + fltr = 'protoPayload.status.details.ipSpaceExhausted.networkOrSubnetworkResource.resourceName' - # Check for exhaustion of pod range - pod_range_exhausted_error = 'GKE_IP_UTILIZATION_POD_RANGES_ALLOCATION_HIGH' - # Define the query strings to be used to search cloud logging. + # using here ':' instead of '=' for 'protoPayload.status.message' because there could be + # status messages like 'IP_SPACE_EXHAUSTED,IP_SPACE_EXHAUSTED' instead of 'IP_SPACE_EXHAUSTED' filter_str = [ - 'log_id("networkanalyzer.googleapis.com/analyzer_reports")', - f'jsonPayload.causeCode="{pod_range_exhausted_error}"', - f'jsonPayload.resourceName:"//container.googleapis.com/projects/{project}"', - f'jsonPayload.resourceName:"clusters/{name}"', - f'jsonPayload.resourceName:"{location}"' + 'log_id("cloudaudit.googleapis.com/activity")', + 'protoPayload.methodName="v1.compute.instances.insert"', + 'resource.type="gce_instance"', 'severity=ERROR', + f'protoPayload.status.message:"{error_msg}"', + f'protoPayload.resourceName:"{location}"', + f'protoPayload.resourceName:"{name[:self.MAX_GKE_NAME_LENGTH]}"', + f'{fltr}="{node_subnet}"' ] filter_str = '\n'.join(filter_str) - ip_space_exhausted_pod_range_log_entries = local_realtime_query(filter_str) + # Check activity logs if 'IP_SPACE_EXHAUSTED' log is present in cloud logging. + op.info(f'Searching cloud logging for the string {error_msg} ' + 'which indicates IP Exhaustion issue') + ip_space_exhausted_log_entries = local_realtime_query(filter_str) - if ip_space_exhausted_pod_range_log_entries: + # Check cloud log entries for IP exhaustion. + if ip_space_exhausted_log_entries: op.info( - 'Verifying if the cluster is an Autopilot cluster or a Standard cluster...' + f'{error_msg} error found for cluster {name} and subnet {node_subnet}' ) - - if cluster.is_autopilot: - op.info('Cluster is an Autopilot cluster') - op.add_failed(cluster, - reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name), - remediation=op.prep_msg(op.FAILURE_REMEDIATION)) - else: - op.info('Cluster is a standard cluster') - op.add_failed(cluster, - reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name), - remediation=op.prep_msg(op.FAILURE_REMEDIATION_ALT1)) + op.add_failed(cluster, + reason=op.prep_msg(op.FAILURE_REASON, + cluster_name=name, + node_subnet=node_subnet), + remediation=op.prep_msg(op.FAILURE_REMEDIATION, + node_subnet=node_subnet)) else: - op.add_ok( - cluster, - reason= - (f'No Pod IP exhaustion issues found for cluster {name} in the project {project}' - )) + op.add_ok(cluster, + reason=op.prep_msg(op.SUCCESS_REASON, cluster_name=name)) -class NodeIpRangeExhaustion(runbook.Step): - """Check Node IP Range Exhaustion and offer remediation. +class PodIpRangeExhaustion(runbook.Step): + """Check Pod IP Range Exhaustion and offer remediation. - Checks Node IP range exhaustion and offers remediation step. + Checks Pod IP range exhaustion and offers remediation step. """ - template = 'ipexhaustion::node_ip_exhaustion' - # max number of characters from the cluster name that will end up in the node name + template = 'ipexhaustion::pod_ip_exhaustion' + MAX_GKE_SUBNET_NAME_LENGTH = 15 MAX_GKE_NAME_LENGTH = 16 def execute(self): - """Checking node IP Exhaustion and offering remediation steps""" + """Checking Pod IP Exhaustion and offering remediation steps""" cluster = gke.get_cluster(op.get(flags.PROJECT_ID), cluster_id=op.get(flags.NAME), location=op.get(flags.LOCATION)) + project = op.get(flags.PROJECT_ID) location = op.get(flags.LOCATION) name = op.get(flags.NAME) error_msg = 'IP_SPACE_EXHAUSTED' - + node_subnet = cluster.get_subnet_name + pool_config = cluster.get_nodepool_config + pool_nr = len(pool_config) + # adding fltr variable, because otherwise the filter string will be longer than 100 chars + fltr = 'protoPayload.status.details.ipSpaceExhausted.networkOrSubnetworkResource.resourceName' + + # using here ':' instead of '=' for 'protoPayload.status.message' because there could be + # status messages like 'IP_SPACE_EXHAUSTED,IP_SPACE_EXHAUSTED' instead of 'IP_SPACE_EXHAUSTED' filter_str = [ 'log_id("cloudaudit.googleapis.com/activity")', 'protoPayload.methodName="v1.compute.instances.insert"', 'resource.type="gce_instance"', 'severity=ERROR', - f'protoPayload.status.message="{error_msg}"', + f'protoPayload.status.message:"{error_msg}"', f'protoPayload.resourceName:"{location}"', f'protoPayload.resourceName:"{name[:self.MAX_GKE_NAME_LENGTH]}"' ] + + # creating a pod_ranges list from a set to exclude duplicates + pod_ranges = list( + {pool_config[i]['networkConfig']['podRange'] for i in range(pool_nr)}) + # add opening parenthesis to the query + filter_str.append('(') + + # create the filter for all subnets for all nodepools + for pod_range in pod_ranges: + resource = (f'{node_subnet[:self.MAX_GKE_SUBNET_NAME_LENGTH]}-' + f'{pod_range[:self.MAX_GKE_SUBNET_NAME_LENGTH]}-') + + # if it's the last element, don't add OR to the filter + if pod_range == pod_ranges[-1]: + filter_str.append(f'{fltr}:"{resource}"') + else: + filter_str.append(f'{fltr}:"{resource}" OR') + + # add closing parenthesis to the query + filter_str.append(')') filter_str = '\n'.join(filter_str) + ip_space_exhausted_pod_range_log_entries = local_realtime_query(filter_str) - # Check activity logs if 'IP_SPACE_EXHAUSTED' log is present in cloud logging. - op.info(f'Searching cloud logging for the string {error_msg} ' - 'which indicates IP Exhaustion issue') - ip_space_exhausted_log_entries = local_realtime_query(filter_str) + if ip_space_exhausted_pod_range_log_entries: + op.info( + 'Verifying if the cluster is an Autopilot cluster or a Standard cluster...' + ) - # Check cloud log entries for IP exhaustion. - if ip_space_exhausted_log_entries: - op.info(f'log entries with {error_msg} found in cloud logging for ' - f'the cluster {name}') - op.add_failed(cluster, - reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name), - remediation=op.prep_msg(op.FAILURE_REMEDIATION)) + if cluster.is_autopilot: + op.info('Cluster is an Autopilot cluster') + op.add_failed(cluster, + reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name), + remediation=op.prep_msg(op.FAILURE_REMEDIATION)) + else: + op.info('Cluster is a Standard cluster') + op.add_failed(cluster, + reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name), + remediation=op.prep_msg(op.FAILURE_REMEDIATION_ALT1)) else: - op.add_ok(cluster, - reason=op.prep_msg(op.SUCCESS_REASON, cluster_name=name)) + op.add_ok( + cluster, + reason= + (f'No Pod IP exhaustion issues found for cluster {name} in the project {project}' + )) class IpExhaustionEnd(runbook.EndStep): diff --git a/gcpdiag/runbook/gke/snapshots/ip_exhaustion.txt b/gcpdiag/runbook/gke/snapshots/ip_exhaustion.txt index 6a6ccb02f..95f80c9a9 100644 --- a/gcpdiag/runbook/gke/snapshots/ip_exhaustion.txt +++ b/gcpdiag/runbook/gke/snapshots/ip_exhaustion.txt @@ -18,21 +18,22 @@ gke/ip-exhaustion: Troubleshooting ip exhaustion issues on GKE clusters. Cluster cluster-1 found in us-central1-c for project gcpdiag-gke3-gggg [AUTOMATED STEP]: Checking node IP Exhaustion and offering remediation steps [INFO]: Searching cloud logging for the string IP_SPACE_EXHAUSTED which indicates IP Exhaustion issue -[INFO]: log entries with IP_SPACE_EXHAUSTED found in cloud logging for the cluster cluster-1 +[INFO]: IP_SPACE_EXHAUSTED error found for cluster cluster-1 and subnet public-subnet - gcpdiag-gke3-gggg/us-central1-c/cluster-1 [FAIL] [REASON] - Node IP exhaustion is detected in the cluster cluster-1 + Node IP exhaustion is detected in the cluster cluster-1 for the subnet public-subnet [REMEDIATION] Please follow the below documentation [1] to expand the ip range of the node subnet. + The subnet that has exhausted its IP space is public-subnet. [1] https://cloud.google.com/vpc/docs/create-modify-vpc-networks#expand-subnet [AUTOMATED STEP]: Checking Pod IP Exhaustion and offering remediation steps [INFO]: Verifying if the cluster is an Autopilot cluster or a Standard cluster... -[INFO]: Cluster is a standard cluster +[INFO]: Cluster is a Standard cluster - gcpdiag-gke3-gggg/us-central1-c/cluster-1 [FAIL] [REASON] diff --git a/gcpdiag/runbook/gke/templates/ipexhaustion.jinja b/gcpdiag/runbook/gke/templates/ipexhaustion.jinja index b2d6cc9a5..a2577257d 100644 --- a/gcpdiag/runbook/gke/templates/ipexhaustion.jinja +++ b/gcpdiag/runbook/gke/templates/ipexhaustion.jinja @@ -30,12 +30,13 @@ No Node IP exhaustion detected in the cluster {cluster_name} {% endblock node_ip_exhaustion_success_reason %} {% block node_ip_exhaustion_failure_reason %} -Node IP exhaustion is detected in the cluster {cluster_name} +Node IP exhaustion is detected in the cluster {cluster_name} for the subnet {node_subnet} {% endblock node_ip_exhaustion_failure_reason %} {% block node_ip_exhaustion_failure_remediation %} Please follow the below documentation [1] to expand the ip range of the node subnet. +The subnet that has exhausted its IP space is {node_subnet}. [1] https://cloud.google.com/vpc/docs/create-modify-vpc-networks#expand-subnet {% endblock node_ip_exhaustion_failure_remediation %} diff --git a/website/content/en/runbook/steps/gke/node-ip-range-exhaustion.md b/website/content/en/runbook/steps/gke/node-ip-range-exhaustion.md index db7c0cc3f..949371a9a 100644 --- a/website/content/en/runbook/steps/gke/node-ip-range-exhaustion.md +++ b/website/content/en/runbook/steps/gke/node-ip-range-exhaustion.md @@ -16,12 +16,13 @@ Checks Node IP range exhaustion and offers remediation step. ### Failure Reason -Node IP exhaustion is detected in the cluster {cluster_name} +Node IP exhaustion is detected in the cluster {cluster_name} for the subnet {node_subnet} ### Failure Remediation Please follow the below documentation [1] to expand the ip range of the node subnet. +The subnet that has exhausted its IP space is {node_subnet}. [1] https://cloud.google.com/vpc/docs/create-modify-vpc-networks#expand-subnet