Skip to content

Commit

Permalink
Add exponential backoff mechanism for restarting fdbserver processes …
Browse files Browse the repository at this point in the history
…in the monitor (#11453)
  • Loading branch information
johscheuer authored Jun 12, 2024
1 parent 49bf518 commit b523064
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 17 deletions.
59 changes: 42 additions & 17 deletions fdbkubernetesmonitor/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,9 @@ import (
"github.com/go-logr/logr"
)

// errorBackoffSeconds is the time to wait after a process fails before starting
// another process.
// This delay will only be applied when there has been more than one failure
// within this time window.
const errorBackoffSeconds = 60
// maxErrorBackoffSeconds is the maximum time to wait after a process fails before starting another process.
// Despite its name, the value is a time.Duration (60 seconds), not a plain number of seconds.
// The actual delay is derived from the number of observed errors and grows until maxErrorBackoffSeconds is hit.
const maxErrorBackoffSeconds = 60 * time.Second

// Monitor provides the main monitor loop
type Monitor struct {
Expand Down Expand Up @@ -228,20 +226,44 @@ func (monitor *Monitor) acceptConfiguration(configuration *api.ProcessConfigurat
}
}

// getBackoffDuration returns the time to wait before the next process start for the given number of successive
// errors. The backoff grows quadratically (errorCounter squared, in seconds) and is capped at
// maxErrorBackoffSeconds. An errorCounter of 0 results in no backoff.
func getBackoffDuration(errorCounter int) time.Duration {
	// Short-circuit for large counters: once the magnitude of the counter exceeds the cap expressed in whole
	// seconds, its square exceeds the cap as well. Without this guard the multiplication below overflows for
	// counters of roughly 96,000 and above; the wrapped result can be negative, which would slip past the cap
	// check and make time.Sleep return immediately, effectively disabling the backoff.
	maxCounter := int(maxErrorBackoffSeconds / time.Second)
	if errorCounter > maxCounter || errorCounter < -maxCounter {
		return maxErrorBackoffSeconds
	}

	timeToBackoff := time.Duration(errorCounter*errorCounter) * time.Second
	if timeToBackoff > maxErrorBackoffSeconds {
		return maxErrorBackoffSeconds
	}

	return timeToBackoff
}

// RunProcess runs a loop to continually start and watch a process.
func (monitor *Monitor) RunProcess(processNumber int) {
pid := 0
logger := monitor.Logger.WithValues("processNumber", processNumber, "area", "RunProcess")
logger.Info("Starting run loop")
startTime := time.Now()
// Counts the successive errors that occurred during process start up. Based on the error count the backoff time
// will be calculated.
var errorCounter int

for {
if !monitor.checkProcessRequired(processNumber) {
return
}

durationSinceLastStart := time.Since(startTime)
// If more than 5 minutes have passed since the last process start, we reset the error counter to reset the backoff time.
if durationSinceLastStart > 5*time.Minute {
errorCounter = 0
}

arguments, err := monitor.ActiveConfiguration.GenerateArguments(processNumber, monitor.CustomEnvironment)
if err != nil {
logger.Error(err, "Error generating arguments for subprocess", "configuration", monitor.ActiveConfiguration)
time.Sleep(errorBackoffSeconds * time.Second)
backoffDuration := getBackoffDuration(errorCounter)
logger.Error(err, "Error generating arguments for subprocess", "configuration", monitor.ActiveConfiguration, "errorCounter", errorCounter, "backoffDuration", backoffDuration.String())
time.Sleep(backoffDuration)
errorCounter++
continue
}
cmd := exec.Cmd{
Path: arguments[0],
Expand All @@ -262,8 +284,10 @@ func (monitor *Monitor) RunProcess(processNumber int) {

err = cmd.Start()
if err != nil {
logger.Error(err, "Error starting subprocess")
time.Sleep(errorBackoffSeconds * time.Second)
backoffDuration := getBackoffDuration(errorCounter)
logger.Error(err, "Error starting subprocess", "backoffDuration", backoffDuration.String())
time.Sleep(backoffDuration)
errorCounter++
continue
}

Expand All @@ -273,7 +297,7 @@ func (monitor *Monitor) RunProcess(processNumber int) {
logger.Error(nil, "No Process information available for subprocess")
}

startTime := time.Now()
startTime = time.Now()
logger.Info("Subprocess started", "PID", pid)

monitor.updateProcessID(processNumber, pid)
Expand Down Expand Up @@ -305,15 +329,16 @@ func (monitor *Monitor) RunProcess(processNumber int) {
exitCode = cmd.ProcessState.ExitCode()
}

logger.Info("Subprocess terminated", "exitCode", exitCode, "PID", pid)

endTime := time.Now()
processDuration := time.Since(startTime)
logger.Info("Subprocess terminated", "exitCode", exitCode, "PID", pid, "lastExecutionDurationSeconds", processDuration.String())
monitor.updateProcessID(processNumber, -1)

processDuration := endTime.Sub(startTime)
if processDuration.Seconds() < errorBackoffSeconds {
logger.Info("Backing off from restarting subprocess", "backOffTimeSeconds", errorBackoffSeconds, "lastExecutionDurationSeconds", processDuration)
time.Sleep(errorBackoffSeconds * time.Second)
// Only backoff if the exit code is non-zero.
if exitCode != 0 {
backoffDuration := getBackoffDuration(errorCounter)
logger.Info("Backing off from restarting subprocess", "backoffDuration", backoffDuration.String(), "lastExecutionDurationSeconds", processDuration.String(), "errorCounter", errorCounter, "exitCode", exitCode)
time.Sleep(backoffDuration)
errorCounter++
}
}
}
Expand Down
54 changes: 54 additions & 0 deletions fdbkubernetesmonitor/monitor_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// monitor_test.go
//
// This source file is part of the FoundationDB open source project
//
// Copyright 2021-2024 Apple Inc. and the FoundationDB project authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package main

import (
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

// Table-driven checks for getBackoffDuration: the backoff is the square of the error count in seconds and is
// capped at 60 seconds.
var _ = Describe("Testing FDB Kubernetes Monitor", func() {
	DescribeTable("when getting the backoff time", func(errorCount int, expected time.Duration) {
		Expect(getBackoffDuration(errorCount)).To(Equal(expected))
	},
		Entry("no errors have occurred",
			0,
			time.Duration(0),
		),
		Entry("one error has occurred",
			1,
			1*time.Second,
		),
		Entry("two errors have occurred",
			2,
			4*time.Second,
		),
		Entry("three errors have occurred",
			3,
			9*time.Second,
		),
		// 7*7 = 49 seconds is the largest value below the cap.
		Entry("seven errors have occurred, should still be below the max backoff seconds",
			7,
			49*time.Second,
		),
		// 8*8 = 64 seconds is the first value above the cap.
		Entry("eight errors have occurred, should return the max backoff seconds",
			8,
			60*time.Second,
		),
		Entry("one hundred errors have occurred, should return the max backoff seconds",
			100,
			60*time.Second,
		),
	)
})

0 comments on commit b523064

Please sign in to comment.