Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: try recovering from gateway sync failure only on UpdateError #6237

Merged
merged 1 commit into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ Adding a new version? You'll need three changes:
- Services using `Secret`s containing the same certificate as client certificates
by annotation `konghq.com/client-cert` can be correctly translated.
[#6228](https://github.com/Kong/kubernetes-ingress-controller/pull/6228)
- Do not try recovering from gateways synchronization errors with fallback configuration
(either generated or the last valid one) when an unexpected error (e.g. 5xx or network issue) occurs.
[#6237](https://github.com/Kong/kubernetes-ingress-controller/pull/6237)

## 3.2.0

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ require (
github.com/google/uuid v1.6.0
github.com/jpillora/backoff v1.0.0
github.com/kong/go-database-reconciler v1.12.2
github.com/kong/go-kong v0.55.0
github.com/kong/go-kong v0.56.0
github.com/kong/kubernetes-telemetry v0.1.4
github.com/kong/kubernetes-testing-framework v0.47.1
github.com/lithammer/dedent v1.1.0
Expand Down
6 changes: 3 additions & 3 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDsl
github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE=
github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ=
github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE=
github.com/go-task/slim-sprig v2.20.0+incompatible h1:4Xh3bDzO29j4TWNOI+24ubc0vbVFMg2PMnXKxK54/CA=
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA=
Expand Down Expand Up @@ -240,8 +240,8 @@ github.com/klauspost/compress v1.17.0 h1:Rnbp4K9EjcDuVuHtd0dgA4qNuv9yKDYKK1ulpJw
github.com/klauspost/compress v1.17.0/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/kong/go-database-reconciler v1.12.2 h1:bnlvLCgP4OjgJOK5JYqq1MlIJ2F7erXERMj8n/LNU8M=
github.com/kong/go-database-reconciler v1.12.2/go.mod h1:bUPJkoeW//x4hzNxewQMoIkeoDzJzunI0stDMYJ3BkU=
github.com/kong/go-kong v0.55.0 h1:lonKRzsDGk12dh9E+y+pWnY2ThXhKuMHjzBHSpCvQLw=
github.com/kong/go-kong v0.55.0/go.mod h1:i1cMgTu6RYPHSyMpviShddRnc+DML/vlpgKC00hr8kU=
github.com/kong/go-kong v0.56.0 h1:/9qbnQJWAgrSAKzL2RViBhHMTYOEyG8N4ClkKnUwEW4=
github.com/kong/go-kong v0.56.0/go.mod h1:gyNwyP1fzztT6sX/0/ygMQ30OiRMIQ51b2jSfstMrcU=
github.com/kong/kubernetes-telemetry v0.1.4 h1:Yz7OlECxWKgNRG1wJ5imA4+H0dQEpdU9d86uhwUVpu4=
github.com/kong/kubernetes-telemetry v0.1.4/go.mod h1:z3yB5naZjUmQCFjzD3hsJ9PVYar7BgPJFKzskUuOejU=
github.com/kong/kubernetes-testing-framework v0.47.1 h1:BE2mQLC0Zj/NC5Y8B1Nm5VZymmrdVfu7cfwVKSobWtg=
Expand Down
78 changes: 37 additions & 41 deletions internal/dataplane/kong_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,6 @@ type KongClient struct {
// While lastProcessedSnapshotHash keeps track of the last processed cache snapshot (the one kept in KongClient.cache),
// lastValidCacheSnapshot can also represent the fallback cache snapshot that was successfully synced with gateways.
lastValidCacheSnapshot *store.CacheStores

// brokenObjects is a list of the Kubernetes resources that failed to sync and triggered a fallback sync.
brokenObjects []fallback.ObjectHash
}

// NewKongClient provides a new KongClient object after connecting to the
Expand Down Expand Up @@ -453,6 +450,8 @@ func (c *KongClient) Update(ctx context.Context) error {
}
// Empty snapshot hash means that the cache hasn't changed since the last snapshot was taken. That optimization can be used
// in main code path to avoid unnecessary processing. TODO: https://github.com/Kong/kubernetes-ingress-controller/issues/6095
// TODO: We should short-circuit here only if all the Gateways were successfully synced with the current configuration.
// https://github.com/Kong/kubernetes-ingress-controller/issues/6219
if newSnapshotHash == store.SnapshotHashEmpty {
c.prometheusMetrics.RecordProcessedConfigSnapshotCacheHit()
c.logger.V(util.DebugLevel).Info("No configuration change; pushing config to gateway is not necessary, skipping")
Expand Down Expand Up @@ -493,7 +492,7 @@ func (c *KongClient) Update(ctx context.Context) error {

// In case of a failure in syncing configuration with Gateways, propagate the error.
if gatewaysSyncErr != nil {
if recoveringErr := c.tryRecoveringFromGatewaysSyncError(
if recoveringErr := c.maybeTryRecoveringFromGatewaysSyncError(
ctx,
cacheSnapshot,
gatewaysSyncErr,
Expand Down Expand Up @@ -532,28 +531,45 @@ func (c *KongClient) maybePreserveTheLastValidConfigCache(lastValidCache store.C
}
}

// tryRecoveringFromGatewaysSyncError tries to recover from a configuration rejection by:
// 1. Generating a fallback configuration and pushing it to the gateways if FallbackConfiguration feature is enabled.
// 2. Applying the last valid configuration to the gateways if FallbackConfiguration is disabled or fallback
// configuration generation fails.
func (c *KongClient) tryRecoveringFromGatewaysSyncError(
// maybeTryRecoveringFromGatewaysSyncError tries to recover from a configuration rejection if the error is of the expected
// UpdateError type. Otherwise, we assume the error is non-recoverable by means of fallback configuration generation
// or applying the last valid configuration, and we need to rely on the retry mechanism. It can happen in case of
// transient network issues, internal server errors, etc.
//
// The recovery can be handled in two ways:
// 1. Generating a fallback configuration and pushing it to the gateways if FallbackConfiguration feature is enabled and
// the UpdateError contains at least 1 broken object.
// 2. Applying the last valid configuration to the gateways if FallbackConfiguration is disabled, fallback
// configuration generation fails, or the UpdateError does not contain any broken objects (it can happen if a gateway
// returns 400 with no meaningful entities' errors).
func (c *KongClient) maybeTryRecoveringFromGatewaysSyncError(
ctx context.Context,
cacheSnapshot store.CacheStores,
gatewaysSyncErr error,
) error {
// If configuration was rejected by the gateways and FallbackConfiguration is enabled,
// we should generate a fallback configuration and push it to the gateways.
if c.kongConfig.FallbackConfiguration {
recoveringErr := c.tryRecoveringWithFallbackConfiguration(ctx, cacheSnapshot, gatewaysSyncErr)
// If the error is not of the expected UpdateError type, we should log it and skip the recovery.
updateErr := sendconfig.UpdateError{}
if !errors.As(gatewaysSyncErr, &updateErr) {
c.logger.V(util.DebugLevel).Info("Skipping recovery from gateways sync error - not enough details to recover",
"error", gatewaysSyncErr)
return nil
}

// If configuration was rejected by the gateways, we identified at least one broken object and FallbackConfiguration
// is enabled, we should generate a fallback configuration and push it to the gateways.
brokenObjects := extractBrokenObjectsFromUpdateError(updateErr)
if c.kongConfig.FallbackConfiguration && len(brokenObjects) > 0 {
recoveringErr := c.tryRecoveringWithFallbackConfiguration(ctx, cacheSnapshot, brokenObjects)
if recoveringErr == nil {
c.logger.Info("Successfully recovered from configuration rejection with fallback configuration")
return nil
}
// If we failed to recover using the fallback configuration, we should log the error and carry on.
// If we failed to recover using the fallback configuration, we should log the error and carry on with the last
// valid configuration.
c.logger.Error(recoveringErr, "Failed to recover from configuration rejection with fallback configuration")
}

// If FallbackConfiguration is disabled, or we failed to recover using the fallback configuration, we should
// If FallbackConfiguration is disabled, we skipped or failed to recover using the fallback configuration, we should
// apply the last valid configuration to the gateways.
if state, found := c.kongConfigFetcher.LastValidConfig(); found {
const isFallback = true
Expand All @@ -565,23 +581,13 @@ func (c *KongClient) tryRecoveringFromGatewaysSyncError(
return nil
}

func (c *KongClient) cacheBrokenObjectList(list []fallback.ObjectHash) {
c.brokenObjects = list
}

// tryRecoveringWithFallbackConfiguration tries to recover from a configuration rejection by generating a fallback
// configuration excluding affected objects from the cache.
func (c *KongClient) tryRecoveringWithFallbackConfiguration(
ctx context.Context,
currentCache store.CacheStores,
gatewaysSyncErr error,
brokenObjects []fallback.ObjectHash,
) error {
// Extract the broken objects from the update error and generate a fallback configuration excluding them.
brokenObjects, err := extractBrokenObjectsFromUpdateError(gatewaysSyncErr)
if err != nil {
return fmt.Errorf("failed to extract broken objects from update error: %w", err)
}

// Generate a fallback cache snapshot.
fallbackCache, generatedCacheMetadata, err := c.generateFallbackCache(currentCache, brokenObjects)
if err != nil {
Expand All @@ -607,8 +613,7 @@ func (c *KongClient) tryRecoveringWithFallbackConfiguration(
}

const isFallback = true
c.cacheBrokenObjectList(brokenObjects)
_, gatewaysSyncErr = c.sendOutToGatewayClients(ctx, fallbackParsingResult.KongState, c.kongConfig, isFallback)
_, gatewaysSyncErr := c.sendOutToGatewayClients(ctx, fallbackParsingResult.KongState, c.kongConfig, isFallback)
if gatewaysSyncErr != nil {
return fmt.Errorf("failed to sync fallback configuration with gateways: %w", gatewaysSyncErr)
}
Expand Down Expand Up @@ -647,24 +652,15 @@ func (c *KongClient) generateFallbackCache(
)
}

// extractBrokenObjectsFromUpdateError.
func extractBrokenObjectsFromUpdateError(err error) ([]fallback.ObjectHash, error) {
// extractBrokenObjectsFromUpdateError extracts broken objects from the UpdateError.
func extractBrokenObjectsFromUpdateError(err sendconfig.UpdateError) []fallback.ObjectHash {
var brokenObjects []client.Object

var updateErr sendconfig.UpdateError
if ok := errors.As(err, &updateErr); !ok {
return nil, fmt.Errorf("expected UpdateError, cannot extract broken objects from %T", err) //nolint:errorlint
}
for _, resourceFailure := range updateErr.ResourceFailures() {
for _, resourceFailure := range err.ResourceFailures() {
brokenObjects = append(brokenObjects, resourceFailure.CausingObjects()...)
}
if len(brokenObjects) == 0 {
return nil, fmt.Errorf("no broken objects found in UpdateError")
}

return lo.Map(brokenObjects, func(obj client.Object, _ int) fallback.ObjectHash {
return fallback.GetObjectHash(obj)
}), nil
})
}

// sendOutToGatewayClients will generate deck content (config) from the provided kong state
Expand Down
Loading
Loading