Skip to content

Commit

Permalink
Merge "fix runbook functionality to properly detect pod IP exhaustion…
Browse files Browse the repository at this point in the history
… and node IP exhaustion"

-- Branch commit log --
commit dbfdff3eac39b4f5af58cc47d1c6baa71bd11168
Author:  Roman Doroschevici <[email protected]>
Date:    2024-12-03T12:16:50Z

    fix runbook functionality to properly detect pod IP exhaustion and node
IP exhaustion

fix 382018864

Change-Id: I3e7365c4a75998d47c2ddf1a1976ef52d402d23d
GitOrigin-RevId: 1e816830e179fb6b2c503bcc038606226b4f4bbe
  • Loading branch information
Roman Doroschevici authored and copybara-github committed Dec 4, 2024
1 parent 3b48bd2 commit a4392a9
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 59 deletions.
12 changes: 12 additions & 0 deletions gcpdiag/queries/gke.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,18 @@ def subnetwork(self) -> Optional[models.Resource]:
raise RuntimeError("can't parse network string: %s" % subnetwork_string)
return network.get_subnetwork(m.group(1), m.group(2), m.group(3))

@property
def get_subnet_name(self) -> Optional[str]:
  """Return the raw 'subnetwork' entry of the cluster resource data.

  The value is handed back exactly as stored under the 'subnetwork' key;
  it is not parsed or resolved into a subnetwork object. Callers treat it
  as a string (presumably the subnet name reported by the GKE API --
  this accessor does not validate that).

  Returns:
    The stored 'subnetwork' value, or None when the key is absent.
  """
  # NOTE(review): assumes _resource_data is a dict-like mapping with .get().
  return self._resource_data.get('subnetwork')

@property
def get_nodepool_config(self) -> Optional[list]:
  """Return the raw 'nodePools' entry of the cluster resource data.

  The value is handed back exactly as stored under the 'nodePools' key.
  Callers take its len() and index into its elements as dicts (presumably
  one dict per node pool as reported by the GKE API -- this accessor does
  not validate that).

  Returns:
    The stored 'nodePools' value, or None when the key is absent. Callers
    must handle the None case before iterating or calling len().
  """
  # NOTE(review): assumes _resource_data is a dict-like mapping with .get().
  return self._resource_data.get('nodePools')

@property
def is_private(self) -> bool:
if not 'privateClusterConfig' in self._resource_data:
Expand Down
145 changes: 91 additions & 54 deletions gcpdiag/runbook/gke/ip_exhaustion.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,106 +117,143 @@ def execute(self):
op.get(flags.PROJECT_ID)))


class PodIpRangeExhaustion(runbook.Step):
"""Check Pod IP Range Exhaustion and offer remediation.
class NodeIpRangeExhaustion(runbook.Step):
"""Check Node IP Range Exhaustion and offer remediation.
Checks Pod IP range exhaustion and offers remediation step.
Checks Node IP range exhaustion and offers remediation step.
"""

template = 'ipexhaustion::pod_ip_exhaustion'
template = 'ipexhaustion::node_ip_exhaustion'
# max number of characters from the cluster name that will end up in the node name
MAX_GKE_NAME_LENGTH = 16

def execute(self):
"""Checking Pod IP Exhaustion and offering remediation steps"""
"""Checking node IP Exhaustion and offering remediation steps"""

cluster = gke.get_cluster(op.get(flags.PROJECT_ID),
cluster_id=op.get(flags.NAME),
location=op.get(flags.LOCATION))
project = op.get(flags.PROJECT_ID)
location = op.get(flags.LOCATION)
name = op.get(flags.NAME)
error_msg = 'IP_SPACE_EXHAUSTED'
node_subnet = cluster.get_subnet_name
# Hold the long log field path in `fltr`; written inline, the filter string would exceed 100 chars.
fltr = 'protoPayload.status.details.ipSpaceExhausted.networkOrSubnetworkResource.resourceName'

# Check for exhaustion of pod range
pod_range_exhausted_error = 'GKE_IP_UTILIZATION_POD_RANGES_ALLOCATION_HIGH'
# Define the query strings to be used to search cloud logging.
# Use ':' (substring match) rather than '=' for 'protoPayload.status.message', because the
# message may repeat the code, e.g. 'IP_SPACE_EXHAUSTED,IP_SPACE_EXHAUSTED'.
filter_str = [
'log_id("networkanalyzer.googleapis.com/analyzer_reports")',
f'jsonPayload.causeCode="{pod_range_exhausted_error}"',
f'jsonPayload.resourceName:"//container.googleapis.com/projects/{project}"',
f'jsonPayload.resourceName:"clusters/{name}"',
f'jsonPayload.resourceName:"{location}"'
'log_id("cloudaudit.googleapis.com/activity")',
'protoPayload.methodName="v1.compute.instances.insert"',
'resource.type="gce_instance"', 'severity=ERROR',
f'protoPayload.status.message:"{error_msg}"',
f'protoPayload.resourceName:"{location}"',
f'protoPayload.resourceName:"{name[:self.MAX_GKE_NAME_LENGTH]}"',
f'{fltr}="{node_subnet}"'
]
filter_str = '\n'.join(filter_str)

ip_space_exhausted_pod_range_log_entries = local_realtime_query(filter_str)
# Check activity logs if 'IP_SPACE_EXHAUSTED' log is present in cloud logging.
op.info(f'Searching cloud logging for the string {error_msg} '
'which indicates IP Exhaustion issue')
ip_space_exhausted_log_entries = local_realtime_query(filter_str)

if ip_space_exhausted_pod_range_log_entries:
# Check cloud log entries for IP exhaustion.
if ip_space_exhausted_log_entries:
op.info(
'Verifying if the cluster is an Autopilot cluster or a Standard cluster...'
f'{error_msg} error found for cluster {name} and subnet {node_subnet}'
)

if cluster.is_autopilot:
op.info('Cluster is an Autopilot cluster')
op.add_failed(cluster,
reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name),
remediation=op.prep_msg(op.FAILURE_REMEDIATION))
else:
op.info('Cluster is a standard cluster')
op.add_failed(cluster,
reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name),
remediation=op.prep_msg(op.FAILURE_REMEDIATION_ALT1))
op.add_failed(cluster,
reason=op.prep_msg(op.FAILURE_REASON,
cluster_name=name,
node_subnet=node_subnet),
remediation=op.prep_msg(op.FAILURE_REMEDIATION,
node_subnet=node_subnet))
else:
op.add_ok(
cluster,
reason=
(f'No Pod IP exhaustion issues found for cluster {name} in the project {project}'
))
op.add_ok(cluster,
reason=op.prep_msg(op.SUCCESS_REASON, cluster_name=name))


class NodeIpRangeExhaustion(runbook.Step):
"""Check Node IP Range Exhaustion and offer remediation.
class PodIpRangeExhaustion(runbook.Step):
"""Check Pod IP Range Exhaustion and offer remediation.
Checks Node IP range exhaustion and offers remediation step.
Checks Pod IP range exhaustion and offers remediation step.
"""

template = 'ipexhaustion::node_ip_exhaustion'
# max number of characters from the cluster name that will end up in the node name
template = 'ipexhaustion::pod_ip_exhaustion'
MAX_GKE_SUBNET_NAME_LENGTH = 15
MAX_GKE_NAME_LENGTH = 16

def execute(self):
"""Checking node IP Exhaustion and offering remediation steps"""
"""Checking Pod IP Exhaustion and offering remediation steps"""

cluster = gke.get_cluster(op.get(flags.PROJECT_ID),
cluster_id=op.get(flags.NAME),
location=op.get(flags.LOCATION))
project = op.get(flags.PROJECT_ID)
location = op.get(flags.LOCATION)
name = op.get(flags.NAME)
error_msg = 'IP_SPACE_EXHAUSTED'

node_subnet = cluster.get_subnet_name
pool_config = cluster.get_nodepool_config
pool_nr = len(pool_config)
# Hold the long log field path in `fltr`; written inline, the filter string would exceed 100 chars.
fltr = 'protoPayload.status.details.ipSpaceExhausted.networkOrSubnetworkResource.resourceName'

# Use ':' (substring match) rather than '=' for 'protoPayload.status.message', because the
# message may repeat the code, e.g. 'IP_SPACE_EXHAUSTED,IP_SPACE_EXHAUSTED'.
filter_str = [
'log_id("cloudaudit.googleapis.com/activity")',
'protoPayload.methodName="v1.compute.instances.insert"',
'resource.type="gce_instance"', 'severity=ERROR',
f'protoPayload.status.message="{error_msg}"',
f'protoPayload.status.message:"{error_msg}"',
f'protoPayload.resourceName:"{location}"',
f'protoPayload.resourceName:"{name[:self.MAX_GKE_NAME_LENGTH]}"'
]

# creating a pod_ranges list from a set to exclude duplicates
pod_ranges = list(
{pool_config[i]['networkConfig']['podRange'] for i in range(pool_nr)})
# add opening parenthesis to the query
filter_str.append('(')

# create the filter for all subnets for all nodepools
for pod_range in pod_ranges:
resource = (f'{node_subnet[:self.MAX_GKE_SUBNET_NAME_LENGTH]}-'
f'{pod_range[:self.MAX_GKE_SUBNET_NAME_LENGTH]}-')

# if it's the last element, don't add OR to the filter
if pod_range == pod_ranges[-1]:
filter_str.append(f'{fltr}:"{resource}"')
else:
filter_str.append(f'{fltr}:"{resource}" OR')

# add closing parenthesis to the query
filter_str.append(')')
filter_str = '\n'.join(filter_str)
ip_space_exhausted_pod_range_log_entries = local_realtime_query(filter_str)

# Check activity logs if 'IP_SPACE_EXHAUSTED' log is present in cloud logging.
op.info(f'Searching cloud logging for the string {error_msg} '
'which indicates IP Exhaustion issue')
ip_space_exhausted_log_entries = local_realtime_query(filter_str)
if ip_space_exhausted_pod_range_log_entries:
op.info(
'Verifying if the cluster is an Autopilot cluster or a Standard cluster...'
)

# Check cloud log entries for IP exhaustion.
if ip_space_exhausted_log_entries:
op.info(f'log entries with {error_msg} found in cloud logging for '
f'the cluster {name}')
op.add_failed(cluster,
reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name),
remediation=op.prep_msg(op.FAILURE_REMEDIATION))
if cluster.is_autopilot:
op.info('Cluster is an Autopilot cluster')
op.add_failed(cluster,
reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name),
remediation=op.prep_msg(op.FAILURE_REMEDIATION))
else:
op.info('Cluster is a Standard cluster')
op.add_failed(cluster,
reason=op.prep_msg(op.FAILURE_REASON, cluster_name=name),
remediation=op.prep_msg(op.FAILURE_REMEDIATION_ALT1))
else:
op.add_ok(cluster,
reason=op.prep_msg(op.SUCCESS_REASON, cluster_name=name))
op.add_ok(
cluster,
reason=
(f'No Pod IP exhaustion issues found for cluster {name} in the project {project}'
))


class IpExhaustionEnd(runbook.EndStep):
Expand Down
7 changes: 4 additions & 3 deletions gcpdiag/runbook/gke/snapshots/ip_exhaustion.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,22 @@ gke/ip-exhaustion: Troubleshooting ip exhaustion issues on GKE clusters.
Cluster cluster-1 found in us-central1-c for project gcpdiag-gke3-gggg
[AUTOMATED STEP]: Checking node IP Exhaustion and offering remediation steps
[INFO]: Searching cloud logging for the string IP_SPACE_EXHAUSTED which indicates IP Exhaustion issue
[INFO]: log entries with IP_SPACE_EXHAUSTED found in cloud logging for the cluster cluster-1
[INFO]: IP_SPACE_EXHAUSTED error found for cluster cluster-1 and subnet public-subnet

- gcpdiag-gke3-gggg/us-central1-c/cluster-1 [FAIL]
[REASON]
Node IP exhaustion is detected in the cluster cluster-1
Node IP exhaustion is detected in the cluster cluster-1 for the subnet public-subnet


[REMEDIATION]
Please follow the below documentation [1] to expand the ip range of the node subnet.
The subnet that has exhausted its IP space is public-subnet.

[1] https://cloud.google.com/vpc/docs/create-modify-vpc-networks#expand-subnet

[AUTOMATED STEP]: Checking Pod IP Exhaustion and offering remediation steps
[INFO]: Verifying if the cluster is an Autopilot cluster or a Standard cluster...
[INFO]: Cluster is a standard cluster
[INFO]: Cluster is a Standard cluster

- gcpdiag-gke3-gggg/us-central1-c/cluster-1 [FAIL]
[REASON]
Expand Down
3 changes: 2 additions & 1 deletion gcpdiag/runbook/gke/templates/ipexhaustion.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,13 @@ No Node IP exhaustion detected in the cluster {cluster_name}
{% endblock node_ip_exhaustion_success_reason %}

{% block node_ip_exhaustion_failure_reason %}
Node IP exhaustion is detected in the cluster {cluster_name}
Node IP exhaustion is detected in the cluster {cluster_name} for the subnet {node_subnet}

{% endblock node_ip_exhaustion_failure_reason %}

{% block node_ip_exhaustion_failure_remediation %}
Please follow the below documentation [1] to expand the ip range of the node subnet.
The subnet that has exhausted its IP space is {node_subnet}.

[1] https://cloud.google.com/vpc/docs/create-modify-vpc-networks#expand-subnet
{% endblock node_ip_exhaustion_failure_remediation %}
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@ Checks Node IP range exhaustion and offers remediation step.

### Failure Reason

Node IP exhaustion is detected in the cluster {cluster_name}
Node IP exhaustion is detected in the cluster {cluster_name} for the subnet {node_subnet}


### Failure Remediation

Please follow the below documentation [1] to expand the ip range of the node subnet.
The subnet that has exhausted its IP space is {node_subnet}.

[1] https://cloud.google.com/vpc/docs/create-modify-vpc-networks#expand-subnet

Expand Down

0 comments on commit a4392a9

Please sign in to comment.