Skip to content

Commit

Permalink
Fix buddy race condition, random crash
Browse files Browse the repository at this point in the history
  • Loading branch information
Bjorn Jorgensen committed Apr 23, 2024
1 parent b6289c6 commit a88a693
Show file tree
Hide file tree
Showing 8 changed files with 81 additions and 26 deletions.
9 changes: 5 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
# Changelog
## [v1.7.3] - 2024-04-23
Change:
- Upgrade Go dependencies

## [v1.7.2] - 2024-04-23
Fix:
- Buddy state inconsistencies
- Buddy state race problems, making the app reset dataset
- Change timestamp not updated correctly

## [v1.7.1] - 2024-04-22
Change:
- Upgrade Go dependencies

## [v1.7.0] - 2024-04-18
Add:
- Config element logformat <txt/json>
Expand Down
10 changes: 8 additions & 2 deletions cmd/dashgoat/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"encoding/json"
"fmt"
"net/http"
"strings"
"time"

"github.com/labstack/echo/v4"
Expand Down Expand Up @@ -73,7 +74,7 @@ func parseAlertmanagerHookMessage(message HookMessage) error {
var post_service_state ServiceState

post_service_state.UpdateKey = "valid"
post_service_state.Severity = message.CommonLabels["severity"]
post_service_state.Severity = strings.ToLower(message.CommonLabels["severity"])
if message.CommonLabels["severity"] == "" {
err := fmt.Errorf("missing CommonLabels[Severity]")
logger.Error("parseAlertmanagerHookMessage", "CommonLabels", err)
Expand All @@ -98,6 +99,8 @@ func parseAlertmanagerHookMessage(message HookMessage) error {
err := fmt.Errorf("missing CommonLabels['prometheus_cluster'], CommonLabels['cluster'] or CommonLabels['prometheus']")
return err
}
post_service_state.Host = strings.ToLower(post_service_state.Host)

post_service_state.Service = message.CommonLabels["namespace"]
if message.CommonLabels["namespace"] == "" {
logger.Info("parseAlertmanagerHookMessage", "missing", "CommonLabels['namespace']")
Expand Down Expand Up @@ -140,6 +143,8 @@ func parseAlertmanagerHookMessage(message HookMessage) error {
post_service_state = runDependOn(post_service_state)

ss.serviceStateList[host_service] = post_service_state

go updateBuddy(post_service_state, "")
}

return nil
Expand All @@ -165,8 +170,9 @@ func parseAlertmanagerAlert(alert Alert, service_state ServiceState) (ServiceSta
}
if service_state.Service == "" {
logger.Error("parseAlertmanagerAlert", "service", "Cant find namespace or container", "alert object", alert.Labels)
} else {
service_state.Service = strings.ToLower(service_state.Service)
}

return service_state, nil

}
Expand Down
62 changes: 49 additions & 13 deletions cmd/dashgoat/buddy.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,42 @@ type (
}
)

// setStateDown records data as the down-state value for the buddy named host
// in the shared backlog.
//
// An empty host is rejected: the call is logged and nothing is stored. The
// write to backlog.StateDown happens while holding the backlog write lock, so
// callers must not already hold backlog.mutex when calling this function.
func setStateDown(host string, data int64) {
	if len(host) == 0 {
		logger.Info("setStateDown", "error", "no host")
		return
	}

	// Hold the write lock only for the duration of the map update.
	backlog.mutex.Lock()
	defer backlog.mutex.Unlock()

	backlog.StateDown[host] = data
}

// getStateDown returns a snapshot of the buddy down-state map from the shared
// backlog.
//
// The snapshot is built while holding the backlog read lock and is a separate
// map: callers may read or modify it without further locking and without
// affecting backlog.StateDown. The result is never nil, even when the backlog
// map is empty or nil.
func getStateDown() map[string]int64 {
	backlog.mutex.RLock()
	defer backlog.mutex.RUnlock()

	// Named "snapshot" rather than "copy" so the predeclared copy builtin is
	// not shadowed; pre-sized because the final length is known up front.
	snapshot := make(map[string]int64, len(backlog.StateDown))
	for host, downSince := range backlog.StateDown {
		snapshot[host] = downSince
	}
	return snapshot
}

// setBacklog replaces the backlog entry for the buddy named host with data.
//
// An empty host is rejected: the call is logged and nothing is stored. The
// slice is stored as given (it is not copied), so passing a nil slice leaves
// the host with an empty backlog. The write happens while holding the backlog
// write lock, so callers must not already hold backlog.mutex.
func setBacklog(host string, data []string) {
	switch host {
	case "":
		logger.Info("setBacklog", "error", "no host")
	default:
		backlog.mutex.Lock()
		defer backlog.mutex.Unlock()

		backlog.buddyBacklog[host] = data
	}
}

// Update Buddies with newly received msg
func updateBuddy(event ServiceState, delete string) {
to_update := listBuddies()
Expand All @@ -32,9 +68,7 @@ func updateBuddy(event ServiceState, delete string) {
return //No buddy to tell
}

backlog.mutex.RLock()
buddyDown := backlog.StateDown
backlog.mutex.RUnlock()
buddyDown := getStateDown()

for _, bhost := range to_update {
if !contains(event.From, bhost.Name) {
Expand Down Expand Up @@ -117,7 +151,7 @@ func talkToBuddyApi(event ServiceState, host Buddy, delete string) {
func findBuddy(buddyConfig []Buddy) {

initBuddyConf(buddyConfig)
buddyAmount := len(buddyRunningConfig.Buddies)
buddyAmount := len(listBuddies())

if buddyAmount < 1 {
setDashGoatReady(true)
Expand Down Expand Up @@ -167,29 +201,30 @@ func findBuddy(buddyConfig []Buddy) {

// report back to UI, statusList
func tellBuddyState(host string, up bool, servicehost string) {
var empty_slice []string
var default_int64 int64

now := time.Now()
backlog.mutex.Lock()
defer backlog.mutex.Unlock()

if _, ok := backlog.StateDown[host]; !ok {
backlog.StateDown[host] = 0
setStateDown(host, default_int64)
}

if up {
if backlog.StateDown[host] != 0 {
if getStateDown()[host] != 0 {
tellServiceListAboutBuddy(host, up)
}
backlog.StateDown[host] = 0
setStateDown(host, default_int64)
deliverBacklog(host, backlog.buddyBacklog[host])
backlog.buddyBacklog[host] = nil
setBacklog(host, empty_slice) //empty backlog for host
} else {
if servicehost != "" {
backlog.buddyBacklog[host] = append(backlog.buddyBacklog[host], servicehost)
backlog_tmp := append(backlog.buddyBacklog[host], servicehost)
setBacklog(host, backlog_tmp)
}
if backlog.StateDown[host] == 0 {
tellServiceListAboutBuddy(host, up)
backlog.StateDown[host] = now.Unix()
setStateDown(host, now.Unix())
}
}
}
Expand Down Expand Up @@ -308,7 +343,9 @@ func AskApiFullStatusList(bhost Buddy) error {

for servicehost, status := range resultMap {
if status.Service != "buddy" {
ss.mutex.Lock()
ss.serviceStateList[servicehost] = status
ss.mutex.Unlock()
}
}

Expand Down Expand Up @@ -375,7 +412,6 @@ func tellServiceListAboutBuddy(buddyName string, up bool) {
logger.Error("tellServiceListAboutBuddy", "error", err)
}

// logger.Info("tellServiceListAboutBuddy", "debugger", result)
iSnewState(result)

ss.serviceStateList[serviceName] = result
Expand Down
4 changes: 3 additions & 1 deletion cmd/dashgoat/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ func heartBeat(c echo.Context) error {
result = post_service_state.Host + post_service_state.Service

ss.serviceStateList[result] = post_service_state

go updateBuddy(post_service_state, "")

return c.JSON(http.StatusOK, result)
}

Expand Down Expand Up @@ -237,7 +240,6 @@ func checkUpdatekey(key string) bool {
}

// checkUrnKey reports whether key is exactly equal to the UrnKey value held in
// the running config.
//
// NOTE(review): this is a plain == comparison; if UrnKey is treated as a
// secret, confirm whether a constant-time comparison is wanted here.
func checkUrnKey(key string) bool {
	expected := config.UrnKey
	return expected == key
}

Expand Down
5 changes: 5 additions & 0 deletions cmd/dashgoat/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,14 @@ var serviceStateCollector *ServiceStateCollector
func main() {
var configfile string

ss.mutex.Lock()
ss.serviceStateList = make(map[string]ServiceState)
ss.mutex.Unlock()

backlog.mutex.Lock()
backlog.buddyBacklog = make(map[string][]string)
backlog.StateDown = make(map[string]int64)
backlog.mutex.Unlock()

e := echo.New()

Expand Down
10 changes: 6 additions & 4 deletions cmd/dashgoat/pagerduty.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ func shouldPagerDutyTrigger(severity_to_check string) bool {
trigger_level := indexOf(severitys[:], config.PagerdutyConfig.TriggerLevel)
to_check := indexOf(severitys[:], severity_to_check)

logger.Info("shouldPagerDutyTrigger", "severity_to_check", to_check, "trigger_level", trigger_level)
return to_check >= trigger_level

}
Expand Down Expand Up @@ -148,7 +149,7 @@ func (c *PdClient) CompilePdEvent(fromstate string, dgss ServiceState) {

err := pdClient.TellPagerDuty(pdevent)
if err != nil {
logger.Error("Error sending to PagerDuty:", err)
logger.Error("CompilePdEvent", "error", "update was not send")

}
}
Expand Down Expand Up @@ -190,21 +191,22 @@ func (c *PdClient) TellPagerDuty(pdevent PagerDutyEvent) error {

req, err := http.NewRequest("POST", c.config.URL, payload)
if err != nil {
logger.Error("PagerDuty POST failed", err)
logger.Error("TellPagerDuty", "POST failed", err)
return err
}

req.Header.Add("Content-Type", "application/json")

res, err := client.Do(req)
if err != nil {
logger.Error("PagerDuty error client", err)
logger.Error("TellPagerDuty", "Do error", err)
return err
}
defer res.Body.Close()

body, err := io.ReadAll(res.Body)
if err != nil {
logger.Error("Failed reading PagerDuty response", err)
logger.Error("TellPagerDuty", "ReadAll PagerDuty", err)
return err
}

Expand Down
6 changes: 4 additions & 2 deletions cmd/dashgoat/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@ func iSnewState(checkss ServiceState) (change string, new_service bool) {

if _, ok := ss.serviceStateList[hostservice]; ok {

current_status := ss.serviceStateList[hostservice].Status

// no change
if ss.serviceStateList[hostservice].Status == checkss.Status {
if current_status == checkss.Status {
return "", false
}

// change
go reportStateChange(ss.serviceStateList[hostservice].Status, checkss)
go reportStateChange(current_status, checkss)
return checkss.Status, false
}

Expand Down
1 change: 1 addition & 0 deletions deploy/azure-functions/.funcignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ __azurite_db*__.json
__blobstorage__
__queuestorage__
local.settings.json
prepare-azure-files.sh
build
deploy
doc
Expand Down

0 comments on commit a88a693

Please sign in to comment.