From b5230642cab927342fb857995ffb8c652f19821a Mon Sep 17 00:00:00 2001 From: Johannes Scheuermann Date: Wed, 12 Jun 2024 15:08:39 +0200 Subject: [PATCH] Add exponential backoff mechanism for restarting fdbserver processes in the monitor (#11453) --- fdbkubernetesmonitor/monitor.go | 59 ++++++++++++++++++++-------- fdbkubernetesmonitor/monitor_test.go | 54 +++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 17 deletions(-) create mode 100644 fdbkubernetesmonitor/monitor_test.go diff --git a/fdbkubernetesmonitor/monitor.go b/fdbkubernetesmonitor/monitor.go index 2ca667b824c..835842b7617 100644 --- a/fdbkubernetesmonitor/monitor.go +++ b/fdbkubernetesmonitor/monitor.go @@ -43,11 +43,9 @@ import ( "github.com/go-logr/logr" ) -// errorBackoffSeconds is the time to wait after a process fails before starting -// another process. -// This delay will only be applied when there has been more than one failure -// within this time window. -const errorBackoffSeconds = 60 +// maxErrorBackoffSeconds is the maximum time to wait after a process fails before starting another process. +// The actual delay will be based on the observed errors and will increase until maxErrorBackoffSeconds is hit. +const maxErrorBackoffSeconds = 60 * time.Second // Monitor provides the main monitor loop type Monitor struct { @@ -228,20 +226,44 @@ func (monitor *Monitor) acceptConfiguration(configuration *api.ProcessConfigurat } } +// getBackoffDuration returns the backoff duration. The backoff time grows quadratically with the error count (errorCounter squared, in seconds), up to a maximum of 60 seconds. +func getBackoffDuration(errorCounter int) time.Duration { + timeToBackoff := time.Duration(errorCounter*errorCounter) * time.Second + if timeToBackoff > maxErrorBackoffSeconds { + return maxErrorBackoffSeconds + } + + return timeToBackoff +} + // RunProcess runs a loop to continually start and watch a process. 
func (monitor *Monitor) RunProcess(processNumber int) { pid := 0 logger := monitor.Logger.WithValues("processNumber", processNumber, "area", "RunProcess") logger.Info("Starting run loop") + startTime := time.Now() + // Counts the successive errors that occurred during process start up. Based on the error count the backoff time + // will be calculated. + var errorCounter int + for { if !monitor.checkProcessRequired(processNumber) { return } + durationSinceLastStart := time.Since(startTime) + // If more than 5 minutes have passed since the last process start (or since the run loop began), we reset the error counter to reset the backoff time. + if durationSinceLastStart > 5*time.Minute { + errorCounter = 0 + } + arguments, err := monitor.ActiveConfiguration.GenerateArguments(processNumber, monitor.CustomEnvironment) if err != nil { - logger.Error(err, "Error generating arguments for subprocess", "configuration", monitor.ActiveConfiguration) - time.Sleep(errorBackoffSeconds * time.Second) + backoffDuration := getBackoffDuration(errorCounter) + logger.Error(err, "Error generating arguments for subprocess", "configuration", monitor.ActiveConfiguration, "errorCounter", errorCounter, "backoffDuration", backoffDuration.String()) + time.Sleep(backoffDuration) + errorCounter++ + continue } cmd := exec.Cmd{ Path: arguments[0], @@ -262,8 +284,10 @@ func (monitor *Monitor) RunProcess(processNumber int) { err = cmd.Start() if err != nil { - logger.Error(err, "Error starting subprocess") - time.Sleep(errorBackoffSeconds * time.Second) + backoffDuration := getBackoffDuration(errorCounter) + logger.Error(err, "Error starting subprocess", "backoffDuration", backoffDuration.String()) + time.Sleep(backoffDuration) + errorCounter++ continue } @@ -273,7 +297,7 @@ func (monitor *Monitor) RunProcess(processNumber int) { logger.Error(nil, "No Process information available for subprocess") } - startTime := time.Now() + startTime = time.Now() logger.Info("Subprocess started", "PID", pid) monitor.updateProcessID(processNumber, pid) @@ 
-305,15 +329,16 @@ func (monitor *Monitor) RunProcess(processNumber int) { exitCode = cmd.ProcessState.ExitCode() } - logger.Info("Subprocess terminated", "exitCode", exitCode, "PID", pid) - - endTime := time.Now() + processDuration := time.Since(startTime) + logger.Info("Subprocess terminated", "exitCode", exitCode, "PID", pid, "lastExecutionDurationSeconds", processDuration.String()) monitor.updateProcessID(processNumber, -1) - processDuration := endTime.Sub(startTime) - if processDuration.Seconds() < errorBackoffSeconds { - logger.Info("Backing off from restarting subprocess", "backOffTimeSeconds", errorBackoffSeconds, "lastExecutionDurationSeconds", processDuration) - time.Sleep(errorBackoffSeconds * time.Second) + // Only backoff if the exit code is non-zero. + if exitCode != 0 { + backoffDuration := getBackoffDuration(errorCounter) + logger.Info("Backing off from restarting subprocess", "backoffDuration", backoffDuration.String(), "lastExecutionDurationSeconds", processDuration.String(), "errorCounter", errorCounter, "exitCode", exitCode) + time.Sleep(backoffDuration) + errorCounter++ } } } diff --git a/fdbkubernetesmonitor/monitor_test.go b/fdbkubernetesmonitor/monitor_test.go new file mode 100644 index 00000000000..a520c89781f --- /dev/null +++ b/fdbkubernetesmonitor/monitor_test.go @@ -0,0 +1,54 @@ +// monitor_test.go +// +// This source file is part of the FoundationDB open source project +// +// Copyright 2021-2024 Apple Inc. and the FoundationDB project authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +package main + +import ( + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("Testing FDB Kubernetes Monitor", func() { + DescribeTable("when getting the backoff time", func(errorCount int, expected time.Duration) { + Expect(getBackoffDuration(errorCount)).To(Equal(expected)) + }, + Entry("no errors have occurred", + 0, + time.Duration(0), + ), + Entry("one error have occurred", + 1, + 1*time.Second, + ), + Entry("two errors have occurred", + 2, + 4*time.Second, + ), + Entry("three errors have occurred", + 3, + 9*time.Second, + ), + Entry("ten errors have occurred, should return the max backoff seconds", + 100, + 60*time.Second, + ), + ) +})