Merge "fix runbook functionality to properly detect pod IP exhaustion and node IP exhaustion"

-- Branch commit log --
commit dbfdff3eac39b4f5af58cc47d1c6baa71bd11168
Author: Roman Doroschevici <[email protected]>
Date: 2024-12-03T12:16:50Z

fix runbook functionality to properly detect pod IP exhaustion and node IP exhaustion

fix 382018864

Change-Id: I3e7365c4a75998d47c2ddf1a1976ef52d402d23d
GitOrigin-RevId: 1e816830e179fb6b2c503bcc038606226b4f4bbe
GoogleCloudPlatform · Dec 4, 2024 · a4392a9
1 parent 3b48bd2
commit a4392a9
Show file tree

Hide file tree

Showing 5 changed files with 111 additions and 59 deletions.
diff --git a/gcpdiag/queries/gke.py b/gcpdiag/queries/gke.py
@@ -383,6 +383,18 @@ def subnetwork(self) -> Optional[models.Resource]:
       raise RuntimeError("can't parse network string: %s" % subnetwork_string)
     return network.get_subnetwork(m.group(1), m.group(2), m.group(3))
 
+  @property
+  def get_subnet_name(self) -> Optional[models.Resource]:
+    if 'subnetwork' not in self._resource_data:
+      return None
+    return self._resource_data['subnetwork']
+
+  @property
+  def get_nodepool_config(self) -> Optional[models.Resource]:
+    if 'nodePools' not in self._resource_data:
+      return None
+    return self._resource_data['nodePools']
+
   @property
   def is_private(self) -> bool:
     if not 'privateClusterConfig' in self._resource_data:

diff --git a/gcpdiag/runbook/gke/ip_exhaustion.py b/gcpdiag/runbook/gke/ip_exhaustion.py
@@ -117,106 +117,143 @@ def execute(self):
                     op.get(flags.PROJECT_ID)))
 
 
-class PodIpRangeExhaustion(runbook.Step):
-  """Check Pod IP Range Exhaustion and offer remediation.
+class NodeIpRangeExhaustion(runbook.Step):
+  """Check Node IP Range Exhaustion and offer remediation.
 
-  Checks Pod IP range exhaustion and offers remediation step.
+  Checks Node IP range exhaustion and offers remediation step.
   """
 
-  template = 'ipexhaustion::pod_ip_exhaustion'
+  template = 'ipexhaustion::node_ip_exhaustion'
+  # max number of characters from the cluster name that will end up in the node name
+  MAX_GKE_NAME_LENGTH = 16
 
   def execute(self):
-    """Checking Pod IP Exhaustion and offering remediation steps"""
+    """Checking node IP Exhaustion and offering remediation steps"""
 
     cluster = gke.get_cluster(op.get(flags.PROJECT_ID),
                               cluster_id=op.get(flags.NAME),
                               location=op.get(flags.LOCATION))
-    project = op.get(flags.PROJECT_ID)
     location = op.get(flags.LOCATION)
     name = op.get(flags.NAME)
+    error_msg = 'IP_SPACE_EXHAUSTED'
+    node_subnet = cluster.get_subnet_name
+    # adding fltr variable, because otherwise the filter string will be longer than 100 chars
+    fltr = 'protoPayload.status.details.ipSpaceExhausted.networkOrSubnetworkResource.resourceName'
 
-    # Check for exhaustion of pod range
-    pod_range_exhausted_error = 'GKE_IP_UTILIZATION_POD_RANGES_ALLOCATION_HIGH'
-    # Define the query strings to be used to search cloud logging.
+    # using here ':' instead of '=' for 'protoPayload.status.message' because there could be
+    # status messages like 'IP_SPACE_EXHAUSTED,IP_SPACE_EXHAUSTED' instead of 'IP_SPACE_EXHAUSTED'
     filter_str = [
-        'log_id("networkanalyzer.googleapis.com/analyzer_reports")',
-        f'jsonPayload.causeCode="{pod_range_exhausted_error}"',
-        f'jsonPayload.resourceName:"//container.googleapis.com/projects/{project}"',
-        f'jsonPayload.resourceName:"clusters/{name}"',
-        f'jsonPayload.resourceName:"{location}"'
+        'log_id("cloudaudit.googleapis.com/activity")',
+        'protoPayload.methodName="v1.compute.instances.insert"',
+        'resource.type="gce_instance"', 'severity=ERROR',
+        f'protoPayload.status.message:"{error_msg}"',
+        f'protoPayload.resourceName:"{location}"',
+        f'protoPayload.resourceName:"{name[:self.MAX_GKE_NAME_LENGTH]}"',
+        f'{fltr}="{node_subnet}"'
     ]
     filter_str = '\n'.join(filter_str)
 
-    ip_space_exhausted_pod_range_log_entries = local_realtime_query(filter_str)
+    # Check activity logs if 'IP_SPACE_EXHAUSTED' log is present in cloud logging.
+    op.info(f'Searching cloud logging for the string {error_msg} '
+            'which indicates IP Exhaustion issue')
+    ip_space_exhausted_log_entries = local_realtime_query(filter_str)
 
-    if ip_space_exhausted_pod_range_log_entries:
+    # Check cloud log entries for IP exhaustion.
+    if ip_space_exhausted_log_entries:
       op.info(
-          'Verifying if the cluster is an Autopilot cluster or a Standard cluster...'
+          f'{error_msg} error found for cluster {name} and subnet {node_subnet}'
       )
-
-      if cluster.is_autopilot:
-        op.info('Cluster is an Autopilot cluster')
-        op.add_failed(cluster,
-                      reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name),
-                      remediation=op.prep_msg(op.FAILURE_REMEDIATION))
-      else:
-        op.info('Cluster is a standard cluster')
-        op.add_failed(cluster,
-                      reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name),
-                      remediation=op.prep_msg(op.FAILURE_REMEDIATION_ALT1))
+      op.add_failed(cluster,
+                    reason=op.prep_msg(op.FAILURE_REASON,
+                                       cluster_name=name,
+                                       node_subnet=node_subnet),
+                    remediation=op.prep_msg(op.FAILURE_REMEDIATION,
+                                            node_subnet=node_subnet))
     else:
-      op.add_ok(
-          cluster,
-          reason=
-          (f'No Pod IP exhaustion issues found for cluster {name} in the project {project}'
-          ))
+      op.add_ok(cluster,
+                reason=op.prep_msg(op.SUCCESS_REASON, cluster_name=name))
 
 
-class NodeIpRangeExhaustion(runbook.Step):
-  """Check Node IP Range Exhaustion and offer remediation.
+class PodIpRangeExhaustion(runbook.Step):
+  """Check Pod IP Range Exhaustion and offer remediation.
 
-  Checks Node IP range exhaustion and offers remediation step.
+  Checks Pod IP range exhaustion and offers remediation step.
   """
 
-  template = 'ipexhaustion::node_ip_exhaustion'
-  # max number of characters from the cluster name that will end up in the node name
+  template = 'ipexhaustion::pod_ip_exhaustion'
+  MAX_GKE_SUBNET_NAME_LENGTH = 15
   MAX_GKE_NAME_LENGTH = 16
 
   def execute(self):
-    """Checking node IP Exhaustion and offering remediation steps"""
+    """Checking Pod IP Exhaustion and offering remediation steps"""
 
     cluster = gke.get_cluster(op.get(flags.PROJECT_ID),
                               cluster_id=op.get(flags.NAME),
                               location=op.get(flags.LOCATION))
+    project = op.get(flags.PROJECT_ID)
     location = op.get(flags.LOCATION)
     name = op.get(flags.NAME)
     error_msg = 'IP_SPACE_EXHAUSTED'
-
+    node_subnet = cluster.get_subnet_name
+    pool_config = cluster.get_nodepool_config
+    pool_nr = len(pool_config)
+    # adding fltr variable, because otherwise the filter string will be longer than 100 chars
+    fltr = 'protoPayload.status.details.ipSpaceExhausted.networkOrSubnetworkResource.resourceName'
+
+    # using here ':' instead of '=' for 'protoPayload.status.message' because there could be
+    # status messages like 'IP_SPACE_EXHAUSTED,IP_SPACE_EXHAUSTED' instead of 'IP_SPACE_EXHAUSTED'
     filter_str = [
         'log_id("cloudaudit.googleapis.com/activity")',
         'protoPayload.methodName="v1.compute.instances.insert"',
         'resource.type="gce_instance"', 'severity=ERROR',
-        f'protoPayload.status.message="{error_msg}"',
+        f'protoPayload.status.message:"{error_msg}"',
         f'protoPayload.resourceName:"{location}"',
         f'protoPayload.resourceName:"{name[:self.MAX_GKE_NAME_LENGTH]}"'
     ]
+
+    # creating a pod_ranges list from a set to exclude duplicates
+    pod_ranges = list(
+        {pool_config[i]['networkConfig']['podRange'] for i in range(pool_nr)})
+    # add opening parenthesis to the query
+    filter_str.append('(')
+
+    # create the filter for all subnets for all nodepools
+    for pod_range in pod_ranges:
+      resource = (f'{node_subnet[:self.MAX_GKE_SUBNET_NAME_LENGTH]}-'
+                  f'{pod_range[:self.MAX_GKE_SUBNET_NAME_LENGTH]}-')
+
+      # if it's the last element, don't add OR to the filter
+      if pod_range == pod_ranges[-1]:
+        filter_str.append(f'{fltr}:"{resource}"')
+      else:
+        filter_str.append(f'{fltr}:"{resource}" OR')
+
+    # add closing parenthesis to the query
+    filter_str.append(')')
     filter_str = '\n'.join(filter_str)
+    ip_space_exhausted_pod_range_log_entries = local_realtime_query(filter_str)
 
-    # Check activity logs if 'IP_SPACE_EXHAUSTED' log is present in cloud logging.
-    op.info(f'Searching cloud logging for the string {error_msg} '
-            'which indicates IP Exhaustion issue')
-    ip_space_exhausted_log_entries = local_realtime_query(filter_str)
+    if ip_space_exhausted_pod_range_log_entries:
+      op.info(
+          'Verifying if the cluster is an Autopilot cluster or a Standard cluster...'
+      )
 
-    # Check cloud log entries for IP exhaustion.
-    if ip_space_exhausted_log_entries:
-      op.info(f'log entries with {error_msg} found in cloud logging for '
-              f'the cluster {name}')
-      op.add_failed(cluster,
-                    reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name),
-                    remediation=op.prep_msg(op.FAILURE_REMEDIATION))
+      if cluster.is_autopilot:
+        op.info('Cluster is an Autopilot cluster')
+        op.add_failed(cluster,
+                      reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name),
+                      remediation=op.prep_msg(op.FAILURE_REMEDIATION))
+      else:
+        op.info('Cluster is a Standard cluster')
+        op.add_failed(cluster,
+                      reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name),
+                      remediation=op.prep_msg(op.FAILURE_REMEDIATION_ALT1))
     else:
-      op.add_ok(cluster,
-                reason=op.prep_msg(op.SUCCESS_REASON, cluster_name=name))
+      op.add_ok(
+          cluster,
+          reason=
+          (f'No Pod IP exhaustion issues found for cluster {name} in the project {project}'
+          ))
 
 
 class IpExhaustionEnd(runbook.EndStep):

diff --git a/gcpdiag/runbook/gke/snapshots/ip_exhaustion.txt b/gcpdiag/runbook/gke/snapshots/ip_exhaustion.txt
@@ -18,21 +18,22 @@ gke/ip-exhaustion: Troubleshooting ip exhaustion issues on GKE clusters.
      Cluster cluster-1 found in us-central1-c for project gcpdiag-gke3-gggg
 [AUTOMATED STEP]: Checking node IP Exhaustion and offering remediation steps
 [INFO]: Searching cloud logging for the string IP_SPACE_EXHAUSTED which indicates IP Exhaustion issue
-[INFO]: log entries with IP_SPACE_EXHAUSTED found in cloud logging for the cluster cluster-1
+[INFO]: IP_SPACE_EXHAUSTED error found for cluster cluster-1 and subnet public-subnet
 
    - gcpdiag-gke3-gggg/us-central1-c/cluster-1                            [FAIL]
      [REASON]
-     Node IP exhaustion is detected in the cluster cluster-1
+     Node IP exhaustion is detected in the cluster cluster-1 for the subnet public-subnet
 
 
      [REMEDIATION]
      Please follow the below documentation [1] to expand the ip range of the node subnet.
+     The subnet that has exhausted its IP space is public-subnet.
 
      [1] https://cloud.google.com/vpc/docs/create-modify-vpc-networks#expand-subnet
 
 [AUTOMATED STEP]: Checking Pod IP Exhaustion and offering remediation steps
 [INFO]: Verifying if the cluster is an Autopilot cluster or a Standard cluster...
-[INFO]: Cluster is a standard cluster
+[INFO]: Cluster is a Standard cluster
 
    - gcpdiag-gke3-gggg/us-central1-c/cluster-1                            [FAIL]
      [REASON]

diff --git a/gcpdiag/runbook/gke/templates/ipexhaustion.jinja b/gcpdiag/runbook/gke/templates/ipexhaustion.jinja
@@ -30,12 +30,13 @@ No Node IP exhaustion detected in the cluster {cluster_name}
 {% endblock node_ip_exhaustion_success_reason %}
 
 {% block node_ip_exhaustion_failure_reason %}
-Node IP exhaustion is detected in the cluster {cluster_name}
+Node IP exhaustion is detected in the cluster {cluster_name} for the subnet {node_subnet}
 
 {% endblock node_ip_exhaustion_failure_reason %}
 
 {% block node_ip_exhaustion_failure_remediation %}
 Please follow the below documentation [1] to expand the ip range of the node subnet.
+The subnet that has exhausted its IP space is {node_subnet}.
 
 [1] https://cloud.google.com/vpc/docs/create-modify-vpc-networks#expand-subnet
 {% endblock node_ip_exhaustion_failure_remediation %}
diff --git a/website/content/en/runbook/steps/gke/node-ip-range-exhaustion.md b/website/content/en/runbook/steps/gke/node-ip-range-exhaustion.md
@@ -16,12 +16,13 @@ Checks Node IP range exhaustion and offers remediation step.
 
 ### Failure Reason
 
-Node IP exhaustion is detected in the cluster {cluster_name}
+Node IP exhaustion is detected in the cluster {cluster_name} for the subnet {node_subnet}
 
 
 ### Failure Remediation
 
 Please follow the below documentation [1] to expand the ip range of the node subnet.
+The subnet that has exhausted its IP space is {node_subnet}.
 
 [1] https://cloud.google.com/vpc/docs/create-modify-vpc-networks#expand-subnet