Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: remote graphite retries #1085

Open
wants to merge 29 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
19f04a9
feat: Added retry logic to remote api request
Jun 17, 2022
3122c96
refactor: update with master
AleksandrMatsko Sep 18, 2024
cb08142
refactor: local checker config
AleksandrMatsko Sep 18, 2024
c0890f7
refactor: config structs, remove unused functions
AleksandrMatsko Sep 18, 2024
8c52507
refactor: remove comments, add new lines
AleksandrMatsko Sep 18, 2024
722a211
refactor: remote graphite tests
AleksandrMatsko Sep 18, 2024
38cca14
refactor: remove comment
AleksandrMatsko Sep 18, 2024
85bc18c
refactor: api config
AleksandrMatsko Sep 19, 2024
48dc77a
merge master into feat/remote-graphite-retries
AleksandrMatsko Oct 1, 2024
f5c96c1
refactor: recognising isUnavailable code, refactor tests
AleksandrMatsko Oct 1, 2024
6842542
refactor: notifier config
AleksandrMatsko Oct 2, 2024
497eb29
refactor: remove empty line
AleksandrMatsko Oct 2, 2024
8c00a84
merge master into feat/remote-graphite-retries
AleksandrMatsko Oct 2, 2024
6db0ad2
merge master into feat/remote-graphite-retries
AleksandrMatsko Oct 4, 2024
f922144
merge master into feat/remote-graphite-retries
AleksandrMatsko Oct 8, 2024
2e54649
feat: retries config in cmd
AleksandrMatsko Oct 9, 2024
409b734
feat: retries logic
AleksandrMatsko Oct 9, 2024
a98dd11
test: exponential backoff factory and add godocs
AleksandrMatsko Oct 9, 2024
4e03075
test: finish fot exponential backoff factory
AleksandrMatsko Oct 9, 2024
cdace7d
test: for retrier
AleksandrMatsko Oct 9, 2024
f83cace
refactor: (work in progress) remote metric source to use retrier
AleksandrMatsko Oct 9, 2024
7826e24
refactor: configs add tests for configs
AleksandrMatsko Oct 11, 2024
0edd1c5
tests: fix for requests
AleksandrMatsko Oct 11, 2024
ab98963
refactor: add testcases to config_test for remote source, use linter
AleksandrMatsko Oct 15, 2024
acc1c67
refactor: fix remote tests, clock interface
AleksandrMatsko Oct 15, 2024
5e95fdf
merge origin/master into feat/remote-graphite-retries
AleksandrMatsko Oct 15, 2024
139d701
refactor: tests
AleksandrMatsko Oct 15, 2024
8be8caa
refactor: error initialization
AleksandrMatsko Oct 15, 2024
2c87158
style: use linter
AleksandrMatsko Oct 15, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions clock/clock.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ func NewSystemClock() *SystemClock {
return &SystemClock{}
}

// Now returns now time.Time with UTC location.
// NowUTC returns now time.Time with UTC location.
func (t *SystemClock) NowUTC() time.Time {
return time.Now().UTC()
}

// Now returns now time.Time as a Unix time.
// NowUnix returns current time in a Unix time format.
func (t *SystemClock) NowUnix() int64 {
return time.Now().Unix()
}
61 changes: 52 additions & 9 deletions cmd/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"strings"

"github.com/moira-alert/moira"
"github.com/moira-alert/moira/metric_source/retries"
"github.com/moira-alert/moira/metrics"

"github.com/moira-alert/moira/image_store/s3"
Expand Down Expand Up @@ -230,15 +231,54 @@ type remoteCommon interface {
getRemoteCommon() *RemoteCommonConfig
}

// RetriesConfig is a settings for retry policy when performing requests to remote sources.
// Stop retrying when ONE of the following conditions is satisfied:
// - Time passed since first try is greater than MaxElapsedTime;
// - Already MaxRetriesCount done.
type RetriesConfig struct {
// InitialInterval between requests.
InitialInterval string `yaml:"initial_interval"`
// RandomizationFactor is used in exponential backoff to add some randomization
// when calculating next interval between requests.
// It will be used in multiplication like:
// RandomizedInterval = RetryInterval * (random value in range [1 - RandomizationFactor, 1 + RandomizationFactor])
RandomizationFactor float64 `yaml:"randomization_factor"`
// Each new RetryInterval will be multiplied on Multiplier.
Multiplier float64 `yaml:"multiplier"`
// MaxInterval is the cap for RetryInterval. Note that it doesn't cap the RandomizedInterval.
MaxInterval string `yaml:"max_interval"`
// MaxElapsedTime caps the time passed from first try. If time passed is greater than MaxElapsedTime than stop retrying.
MaxElapsedTime string `yaml:"max_elapsed_time"`
// MaxRetriesCount is the amount of allowed retries. So at most MaxRetriesCount will be performed.
MaxRetriesCount uint64 `yaml:"max_retries_count"`
}

func (config RetriesConfig) getRetriesSettings() retries.Config {
return retries.Config{
InitialInterval: to.Duration(config.InitialInterval),
RandomizationFactor: config.RandomizationFactor,
Multiplier: config.Multiplier,
MaxInterval: to.Duration(config.MaxInterval),
MaxElapsedTime: to.Duration(config.MaxElapsedTime),
MaxRetriesCount: config.MaxRetriesCount,
}
}

// GraphiteRemoteConfig is remote graphite settings structure.
type GraphiteRemoteConfig struct {
RemoteCommonConfig `yaml:",inline"`
// Timeout for remote requests
// Timeout for remote requests.
Timeout string `yaml:"timeout"`
// Username for basic auth
// Username for basic auth.
User string `yaml:"user"`
// Password for basic auth
// Password for basic auth.
Password string `yaml:"password"`
// Retries configuration for general requests to remote graphite.
Retries RetriesConfig `yaml:"retries"`
// HealthcheckTimeout is timeout for remote api health check requests.
HealthcheckTimeout string `yaml:"health_check_timeout"`
// HealthCheckRetries configuration for healthcheck requests to remote graphite.
HealthCheckRetries RetriesConfig `yaml:"health_check_retries"`
}

func (config GraphiteRemoteConfig) getRemoteCommon() *RemoteCommonConfig {
Expand All @@ -248,12 +288,15 @@ func (config GraphiteRemoteConfig) getRemoteCommon() *RemoteCommonConfig {
// GetRemoteSourceSettings returns remote config parsed from moira config files.
func (config *GraphiteRemoteConfig) GetRemoteSourceSettings() *graphiteRemoteSource.Config {
return &graphiteRemoteSource.Config{
URL: config.URL,
CheckInterval: to.Duration(config.CheckInterval),
MetricsTTL: to.Duration(config.MetricsTTL),
Timeout: to.Duration(config.Timeout),
User: config.User,
Password: config.Password,
URL: config.URL,
CheckInterval: to.Duration(config.CheckInterval),
MetricsTTL: to.Duration(config.MetricsTTL),
Timeout: to.Duration(config.Timeout),
User: config.User,
Password: config.Password,
Retries: config.Retries.getRetriesSettings(),
HealthcheckTimeout: to.Duration(config.HealthcheckTimeout),
HealthcheckRetries: config.HealthCheckRetries.getRetriesSettings(),
}
}

Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ require (
github.com/Masterminds/goutils v1.1.1 // indirect
github.com/Masterminds/semver/v3 v3.2.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/dyatlov/go-opengraph/opengraph v0.0.0-20220524092352-606d7b1e5f8a // indirect
github.com/fatih/color v1.16.0 // indirect
github.com/go-openapi/jsonpointer v0.20.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ github.com/bwmarrin/discordgo v0.25.0 h1:NXhdfHRNxtwso6FPdzW2i3uBvvU7UIQTghmV2T4
github.com/bwmarrin/discordgo v0.25.0/go.mod h1:NJZpH+1AfhIcyQsPeuBKsUtYrRnjkyu0kIVMCHkZtRY=
github.com/carlosdp/twiliogo v0.0.0-20161027183705-b26045ebb9d1 h1:hXakhQtPnXH839q1pBl/GqfTSchqE+R5Fqn98Iu7UQM=
github.com/carlosdp/twiliogo v0.0.0-20161027183705-b26045ebb9d1/go.mod h1:pAxCBpjl/0JxYZlWGP/Dyi8f/LQSCQD2WAsG/iNzqQ8=
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
Expand Down
15 changes: 14 additions & 1 deletion local/api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,21 @@ graphite_remote:
cluster_name: Graphite 1
url: "http://graphite:80/render"
check_interval: 60s
timeout: 60s
metrics_ttl: 168h
timeout: 60s
retries:
initial_interval: 60s
randomization_factor: 0.5
multiplier: 1.5
max_interval: 120s
max_retries_count: 3
health_check_timeout: 6s
health_check_retries:
initial_interval: 20s
randomization_factor: 0.5
multiplier: 1.5
max_interval: 80s
max_retries_count: 3
prometheus_remote:
- cluster_id: default
cluster_name: Prometheus 1
Expand Down
15 changes: 14 additions & 1 deletion local/checker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,21 @@ graphite_remote:
cluster_name: Graphite 1
url: "http://graphite:80/render"
check_interval: 60s
timeout: 60s
metrics_ttl: 168h
timeout: 60s
retries:
initial_interval: 60s
randomization_factor: 0.5
multiplier: 1.5
max_interval: 120s
max_retries_count: 3
health_check_timeout: 6s
health_check_retries:
initial_interval: 20s
randomization_factor: 0.5
multiplier: 1.5
max_interval: 80s
max_retries_count: 3
prometheus_remote:
- cluster_id: default
cluster_name: Prometheus 1
Expand Down
15 changes: 14 additions & 1 deletion local/notifier.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,21 @@ graphite_remote:
cluster_name: Graphite 1
url: "http://graphite:80/render"
check_interval: 60s
timeout: 60s
metrics_ttl: 168h
timeout: 60s
retries:
initial_interval: 60s
randomization_factor: 0.5
multiplier: 1.5
max_interval: 120s
max_retries_count: 3
health_check_timeout: 6s
health_check_retries:
initial_interval: 20s
randomization_factor: 0.5
multiplier: 1.5
max_interval: 80s
max_retries_count: 3
prometheus_remote:
- cluster_id: default
cluster_name: Prometheus 1
Expand Down
54 changes: 47 additions & 7 deletions metric_source/remote/config.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,53 @@
package remote

import "time"
import (
"errors"
"time"

"github.com/moira-alert/moira/metric_source/retries"
)

// Config represents config from remote storage.
type Config struct {
URL string
CheckInterval time.Duration
MetricsTTL time.Duration
Timeout time.Duration
User string
Password string
URL string
CheckInterval time.Duration
MetricsTTL time.Duration
Timeout time.Duration
User string
Password string
HealthcheckTimeout time.Duration
Retries retries.Config
HealthcheckRetries retries.Config
}

var (
errBadRemoteUrl = errors.New("remote graphite URL should not be empty")
errNoTimeout = errors.New("timeout must be specified and can't be 0")
errNoHealthcheckTimeout = errors.New("healthcheck_timeout must be specified and can't be 0")
)

func (conf Config) validate() error {
resErrors := make([]error, 0)

if conf.URL == "" {
resErrors = append(resErrors, errBadRemoteUrl)
}

if conf.Timeout == 0 {
resErrors = append(resErrors, errNoTimeout)
}

if conf.HealthcheckTimeout == 0 {
resErrors = append(resErrors, errNoHealthcheckTimeout)
}

if errRetriesValidate := conf.Retries.Validate(); errRetriesValidate != nil {
resErrors = append(resErrors, errRetriesValidate)
}

if errHealthcheckRetriesValidate := conf.HealthcheckRetries.Validate(); errHealthcheckRetriesValidate != nil {
resErrors = append(resErrors, errHealthcheckRetriesValidate)
}

return errors.Join(resErrors...)
}
88 changes: 88 additions & 0 deletions metric_source/remote/config_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package remote

import (
"errors"
"fmt"
"testing"
"time"

"github.com/moira-alert/moira/metric_source/retries"

. "github.com/smartystreets/goconvey/convey"
)

func TestConfig_validate(t *testing.T) {
Convey("Test validating retries config", t, func() {
type testcase struct {
caseDesc string
conf Config
expectedErr error
}

var (
testInitialInterval = time.Second * 5
testMaxInterval = time.Second * 10
testRetriesCount uint64 = 10
)

testRetriesConf := retries.Config{
InitialInterval: testInitialInterval,
MaxInterval: testMaxInterval,
MaxRetriesCount: testRetriesCount,
}

cases := []testcase{
{
caseDesc: "with empty config",
conf: Config{},
expectedErr: errors.Join(errBadRemoteUrl, errNoTimeout, errNoHealthcheckTimeout, retries.Config{}.Validate(), retries.Config{}.Validate()),
},
{
caseDesc: "with retries config set",
conf: Config{
Retries: testRetriesConf,
HealthcheckRetries: testRetriesConf,
},
expectedErr: errors.Join(errBadRemoteUrl, errNoTimeout, errNoHealthcheckTimeout),
},
{
caseDesc: "with retries config set and some url",
conf: Config{
URL: "http://test-graphite",
Retries: testRetriesConf,
HealthcheckRetries: testRetriesConf,
},
expectedErr: errors.Join(errNoTimeout, errNoHealthcheckTimeout),
},
{
caseDesc: "with retries config set, some url, timeout",
conf: Config{
Timeout: time.Second,
URL: "http://test-graphite",
Retries: testRetriesConf,
HealthcheckRetries: testRetriesConf,
},
expectedErr: errors.Join(errNoHealthcheckTimeout),
},
{
caseDesc: "with valid config",
conf: Config{
Timeout: time.Second,
HealthcheckTimeout: time.Millisecond,
URL: "http://test-graphite",
Retries: testRetriesConf,
HealthcheckRetries: testRetriesConf,
},
expectedErr: nil,
},
}

for i := range cases {
Convey(fmt.Sprintf("Case %d: %s", i+1, cases[i].caseDesc), func() {
err := cases[i].conf.validate()

So(err, ShouldResemble, cases[i].expectedErr)
})
}
})
}
Loading
Loading