Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add metric in notifier-selfcheck #910

Merged
merged 8 commits into from
Sep 14, 2023
9 changes: 9 additions & 0 deletions cmd/notifier/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ type selfStateConfig struct {
Contacts []map[string]string `yaml:"contacts"`
// Self state monitor alerting interval
NoticeInterval string `yaml:"notice_interval"`
// Self state monitor check interval
CheckInterval string `yaml:"check_interval"`
}

func getDefault() config {
Expand Down Expand Up @@ -206,12 +208,19 @@ func checkDateTimeFormat(format string) error {
}

func (config *selfStateConfig) getSettings() selfstate.Config {
// 10 sec is default check value
checkInterval := 10 * time.Second
if config.CheckInterval != "" {
checkInterval = to.Duration(config.CheckInterval)
}

return selfstate.Config{
Enabled: config.Enabled,
RedisDisconnectDelaySeconds: int64(to.Duration(config.RedisDisconnectDelay).Seconds()),
LastMetricReceivedDelaySeconds: int64(to.Duration(config.LastMetricReceivedDelay).Seconds()),
LastCheckDelaySeconds: int64(to.Duration(config.LastCheckDelay).Seconds()),
LastRemoteCheckDelaySeconds: int64(to.Duration(config.LastRemoteCheckDelay).Seconds()),
CheckInterval: checkInterval,
Contacts: config.Contacts,
NoticeIntervalSeconds: int64(to.Duration(config.NoticeInterval).Seconds()),
}
Expand Down
15 changes: 11 additions & 4 deletions cmd/notifier/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func main() {
}
defer logger.Info().
String("moira_version", MoiraVersion).
Msg("Moira Notifier stopped. Version")
Msg("Moira Notifier stopped.")

telemetry, err := cmd.ConfigureTelemetry(logger, config.Telemetry, serviceName)
if err != nil {
Expand All @@ -79,7 +79,6 @@ func main() {
}
defer telemetry.Stop()

notifierMetrics := metrics.ConfigureNotifierMetrics(telemetry.Metrics, serviceName)
databaseSettings := config.Redis.GetSettings()
notificationHistorySettings := config.NotificationHistory.GetSettings()
database := redis.NewDatabase(logger, databaseSettings, notificationHistorySettings, redis.Notifier)
Expand All @@ -103,7 +102,15 @@ func main() {

notifierConfig := config.Notifier.getSettings(logger)

sender := notifier.NewNotifier(database, logger, notifierConfig, notifierMetrics, metricSourceProvider, imageStoreMap)
notifierMetrics := metrics.ConfigureNotifierMetrics(telemetry.Metrics, serviceName)
sender := notifier.NewNotifier(
database,
logger,
notifierConfig,
notifierMetrics,
metricSourceProvider,
imageStoreMap,
)

// Register moira senders
if err := sender.RegisterSenders(database); err != nil {
Expand All @@ -114,7 +121,7 @@ func main() {

// Start moira self state checker
if config.Notifier.SelfState.getSettings().Enabled {
selfState := selfstate.NewSelfCheckWorker(logger, database, sender, config.Notifier.SelfState.getSettings())
selfState := selfstate.NewSelfCheckWorker(logger, database, sender, config.Notifier.SelfState.getSettings(), metrics.ConfigureHeartBeatMetrics(telemetry.Metrics))
if err := selfState.Start(); err != nil {
logger.Fatal().
Error(err).
Expand Down
22 changes: 22 additions & 0 deletions metrics/heartbeat.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package metrics

// HeartBeatMetrics is a collection of metrics used in heartbeats.
type HeartBeatMetrics struct {
// notifierIsAlive is the meter that MarkNotifierIsAlive marks with 1 or 0.
notifierIsAlive Meter
}

// ConfigureHeartBeatMetrics builds the heartbeat metrics collection and
// obtains its "alive" meter from the given registry.
func ConfigureHeartBeatMetrics(registry Registry) *HeartBeatMetrics {
	heartBeatMetrics := new(HeartBeatMetrics)
	heartBeatMetrics.notifierIsAlive = registry.NewMeter("", "alive")

	return heartBeatMetrics
}

// MarkNotifierIsAlive reports the notifier liveness to the heartbeat meter.
// It marks 1 when isAlive is true and 0 otherwise; exactly one mark is made
// per call.
func (hb HeartBeatMetrics) MarkNotifierIsAlive(isAlive bool) {
	if isAlive {
		hb.notifierIsAlive.Mark(1)
		// Return here: without it the alive path fell through and also
		// marked 0, so every call reported both values.
		return
	}

	hb.notifierIsAlive.Mark(0)
}
2 changes: 1 addition & 1 deletion notifier/notifications/notifications.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ func (worker *FetchNotificationsWorker) Start() {
switch err.(type) {
case notifierInBadStateError:
worker.Logger.Warning().
String("stop_sending_notofocations_for", sleepAfterNotifierBadState.String()).
String("stop_sending_notifications_for", sleepAfterNotifierBadState.String()).
Error(err).
Msg("Stop sending notifications for some time. Fix SelfState errors and turn on notifier in /notifications page")
<-time.After(sleepAfterNotifierBadState)
Expand Down
36 changes: 18 additions & 18 deletions notifier/selfstate/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (
func (selfCheck *SelfCheckWorker) selfStateChecker(stop <-chan struct{}) error {
selfCheck.Logger.Info().Msg("Moira Notifier Self State Monitor started")

checkTicker := time.NewTicker(defaultCheckInterval)
checkTicker := time.NewTicker(selfCheck.Config.CheckInterval)
defer checkTicker.Stop()

nextSendErrorMessage := time.Now().Unix()
Expand All @@ -23,33 +23,35 @@ func (selfCheck *SelfCheckWorker) selfStateChecker(stop <-chan struct{}) error {
selfCheck.Logger.Info().Msg("Moira Notifier Self State Monitor stopped")
return nil
case <-checkTicker.C:
selfCheck.Logger.Debug().
Int64("nextSendErrorMessage", nextSendErrorMessage).
Msg("call check")

nextSendErrorMessage = selfCheck.check(time.Now().Unix(), nextSendErrorMessage)
}
}
}

func (selfCheck *SelfCheckWorker) handleCheckServices(nowTS int64) []moira.NotificationEvent {
var events []moira.NotificationEvent //nolint
var events []moira.NotificationEvent

for _, heartbeat := range selfCheck.heartbeats {
currentValue, needSend, err := heartbeat.Check(nowTS)
currentValue, hasErrors, err := heartbeat.Check(nowTS)
if err != nil {
selfCheck.Logger.Error().
Error(err).
Msg("Heartbeat failed")
}

if !needSend {
continue
}
if hasErrors {
events = append(events, generateNotificationEvent(heartbeat.GetErrorMessage(), currentValue))
if heartbeat.NeedTurnOffNotifier() {
selfCheck.setNotifierState(moira.SelfStateERROR)
}

events = append(events, generateNotificationEvent(heartbeat.GetErrorMessage(), currentValue))
if heartbeat.NeedTurnOffNotifier() {
selfCheck.setNotifierState(moira.SelfStateERROR)
}

if !heartbeat.NeedToCheckOthers() {
break
if !heartbeat.NeedToCheckOthers() {
break
}
}
}

Expand All @@ -67,11 +69,9 @@ func (selfCheck *SelfCheckWorker) sendNotification(events []moira.NotificationEv
}

func (selfCheck *SelfCheckWorker) check(nowTS int64, nextSendErrorMessage int64) int64 {
if nextSendErrorMessage < nowTS {
events := selfCheck.handleCheckServices(nowTS)
if len(events) > 0 {
nextSendErrorMessage = selfCheck.sendNotification(events, nowTS)
}
events := selfCheck.handleCheckServices(nowTS)
if nextSendErrorMessage < nowTS && len(events) > 0 {
nextSendErrorMessage = selfCheck.sendNotification(events, nowTS)
}

return nextSendErrorMessage
Expand Down
3 changes: 3 additions & 0 deletions notifier/selfstate/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package selfstate

import (
"fmt"
"time"
)

// Config is representation of self state worker settings like moira admins contacts and threshold values for checked services
Expand All @@ -12,6 +13,7 @@ type Config struct {
LastCheckDelaySeconds int64
LastRemoteCheckDelaySeconds int64
NoticeIntervalSeconds int64
CheckInterval time.Duration
Contacts []map[string]string
}

Expand All @@ -30,5 +32,6 @@ func (config *Config) checkConfig(senders map[string]bool) error {
return fmt.Errorf("value for [%s] must be present", adminContact["type"])
}
}

return nil
}
25 changes: 19 additions & 6 deletions notifier/selfstate/heartbeat/notifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,42 @@ package heartbeat
import (
"fmt"

"github.com/moira-alert/moira/metrics"

"github.com/moira-alert/moira"
)

type notifier struct {
db moira.Database
log moira.Logger
db moira.Database
log moira.Logger
metrics *metrics.HeartBeatMetrics
}

func GetNotifier(logger moira.Logger, database moira.Database) Heartbeater {
func GetNotifier(logger moira.Logger, database moira.Database, metrics *metrics.HeartBeatMetrics) Heartbeater {
return &notifier{
db: database,
log: logger,
db: database,
log: logger,
metrics: metrics,
}
}

func (check notifier) Check(int64) (int64, bool, error) {
if state, _ := check.db.GetNotifierState(); state != moira.SelfStateOK {
state, _ := check.db.GetNotifierState()
almostinf marked this conversation as resolved.
Show resolved Hide resolved
if state != moira.SelfStateOK {
check.metrics.MarkNotifierIsAlive(true)

check.log.Error().
String("error", check.GetErrorMessage()).
Msg("Notifier is not healthy")

return 0, true, nil
}
check.metrics.MarkNotifierIsAlive(false)

check.log.Debug().
String("state", state).
Msg("Notifier is healthy")

return 0, false, nil
}

Expand Down
5 changes: 4 additions & 1 deletion notifier/selfstate/heartbeat/notifier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import (
"testing"
"time"

"github.com/moira-alert/moira/metrics"

"github.com/moira-alert/moira"
mock_moira_alert "github.com/moira-alert/moira/mock/moira-alert"

Expand Down Expand Up @@ -45,6 +47,7 @@ func TestNotifierState(t *testing.T) {
func createNotifierStateTest(t *testing.T) *notifier {
mockCtrl := gomock.NewController(t)
logger, _ := logging.GetLogger("MetricDelay")
metric := metrics.ConfigureHeartBeatMetrics(metrics.NewDummyRegistry())

return GetNotifier(logger, mock_moira_alert.NewMockDatabase(mockCtrl)).(*notifier)
return GetNotifier(logger, mock_moira_alert.NewMockDatabase(mockCtrl), metric).(*notifier)
}
12 changes: 6 additions & 6 deletions notifier/selfstate/selfstate.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package selfstate
import (
"time"

"github.com/moira-alert/moira/metrics"

"github.com/moira-alert/moira/notifier/selfstate/heartbeat"

"gopkg.in/tomb.v2"
Expand All @@ -12,8 +14,6 @@ import (
w "github.com/moira-alert/moira/worker"
)

var defaultCheckInterval = time.Second * 10

const selfStateLockName = "moira-self-state-monitor"
const selfStateLockTTL = time.Second * 15

Expand All @@ -28,8 +28,8 @@ type SelfCheckWorker struct {
}

// NewSelfCheckWorker creates SelfCheckWorker.
func NewSelfCheckWorker(logger moira.Logger, database moira.Database, notifier notifier.Notifier, config Config) *SelfCheckWorker {
heartbeats := createStandardHeartbeats(logger, database, config)
func NewSelfCheckWorker(logger moira.Logger, database moira.Database, notifier notifier.Notifier, config Config, metrics *metrics.HeartBeatMetrics) *SelfCheckWorker {
heartbeats := createStandardHeartbeats(logger, database, config, metrics)
return &SelfCheckWorker{Logger: logger, Database: database, Notifier: notifier, Config: config, heartbeats: heartbeats}
}

Expand Down Expand Up @@ -59,7 +59,7 @@ func (selfCheck *SelfCheckWorker) Stop() error {
return selfCheck.tomb.Wait()
}

func createStandardHeartbeats(logger moira.Logger, database moira.Database, conf Config) []heartbeat.Heartbeater {
func createStandardHeartbeats(logger moira.Logger, database moira.Database, conf Config, metrics *metrics.HeartBeatMetrics) []heartbeat.Heartbeater {
heartbeats := make([]heartbeat.Heartbeater, 0)

if hb := heartbeat.GetDatabase(conf.RedisDisconnectDelaySeconds, logger, database); hb != nil {
Expand All @@ -78,7 +78,7 @@ func createStandardHeartbeats(logger moira.Logger, database moira.Database, conf
heartbeats = append(heartbeats, hb)
}

if hb := heartbeat.GetNotifier(logger, database); hb != nil {
if hb := heartbeat.GetNotifier(logger, database, metrics); hb != nil {
heartbeats = append(heartbeats, hb)
}

Expand Down
8 changes: 6 additions & 2 deletions notifier/selfstate/selfstate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"testing"
"time"

"github.com/moira-alert/moira/metrics"

mock_heartbeat "github.com/moira-alert/moira/mock/heartbeat"
"github.com/moira-alert/moira/notifier/selfstate/heartbeat"

Expand Down Expand Up @@ -143,7 +145,6 @@ func configureWorker(t *testing.T, isStart bool) *selfCheckWorkerMock {
"type": "admin-mail",
"value": "[email protected]",
}
defaultCheckInterval = time.Second * 1
conf := Config{
Enabled: true,
Contacts: []map[string]string{
Expand All @@ -154,6 +155,7 @@ func configureWorker(t *testing.T, isStart bool) *selfCheckWorkerMock {
LastCheckDelaySeconds: 120,
NoticeIntervalSeconds: 60,
LastRemoteCheckDelaySeconds: 120,
CheckInterval: 1 * time.Second,
Tetrergeru marked this conversation as resolved.
Show resolved Hide resolved
}

mockCtrl := gomock.NewController(t)
Expand All @@ -172,9 +174,11 @@ func configureWorker(t *testing.T, isStart bool) *selfCheckWorkerMock {
database.EXPECT().NewLock(gomock.Any(), gomock.Any()).Return(lock)
}

metric := &metrics.HeartBeatMetrics{}

return &selfCheckWorkerMock{

selfCheckWorker: NewSelfCheckWorker(logger, database, notif, conf),
selfCheckWorker: NewSelfCheckWorker(logger, database, notif, conf, metric),
database: database,
notif: notif,
conf: conf,
Expand Down
Loading