This repository has been archived by the owner on Jan 2, 2025. It is now read-only.

Flesh out Pagerduty Severity functionality (#61)
* Add NodeDownSeverity

Signed-off-by: Dylan Schultz <[email protected]>

* Update severities

Signed-off-by: Dylan Schultz <[email protected]>

* Update config

Signed-off-by: Dylan Schultz <[email protected]>

* Update example

Signed-off-by: Dylan Schultz <[email protected]>

* Update priority name

Signed-off-by: Dylan Schultz <[email protected]>

---------

Signed-off-by: Dylan Schultz <[email protected]>
dylanschultzie authored May 11, 2023
1 parent 9410c7a commit a9f1ace
Showing 3 changed files with 22 additions and 17 deletions.
6 changes: 4 additions & 2 deletions example-config.yml
@@ -10,6 +10,8 @@ listen_port: 8888
 hide_logs: no
 # How long to wait before alerting that a node is down.
 node_down_alert_minutes: 3
+# Node Down alert Pagerduty Severity
+node_down_alert_severity: critical
 
 # Should the prometheus exporter be enabled?
 prometheus_enabled: yes
@@ -74,15 +76,15 @@ chains:
 consecutive_enabled: yes
 # How many missed blocks should trigger a notification?
 consecutive_missed: 5
-# NOT USED: future hint for pagerduty's routing
+# Consecutive Missed alert Pagerduty Severity
 consecutive_priority: critical
 
 # For each chain there is a specific window of blocks and a percentage of missed blocks that will result in
 # a downtime jail infraction. Should an alert be sent if a certain percentage of this window is exceeded?
 percentage_enabled: no
 # What percentage should trigger the alert
 percentage_missed: 10
-# Not used yet, pagerduty routing hint
+# Percentage Missed alert Pagerduty Severity
 percentage_priority: warning
 
 # Should an alert be sent if the validator is not in the active set ie, jailed,
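PagerDuty's Events API v2 accepts only the severity values critical, error, warning, and info, so the node_down_alert_severity, consecutive_priority, and percentage_priority keys above should be set to one of those strings. A minimal validation sketch, not part of this commit; the checkSeverity helper and its error handling are illustrative:

package main

import (
    "fmt"
    "log"
)

// validSeverities lists the severity values accepted by PagerDuty's Events API v2.
var validSeverities = map[string]bool{
    "critical": true,
    "error":    true,
    "warning":  true,
    "info":     true,
}

// checkSeverity is a hypothetical helper (not part of tenderduty): it rejects
// any configured severity that PagerDuty would not accept.
func checkSeverity(key, value string) error {
    if !validSeverities[value] {
        return fmt.Errorf("%s: %q is not a valid PagerDuty severity", key, value)
    }
    return nil
}

func main() {
    // Values taken from the example config above.
    settings := map[string]string{
        "node_down_alert_severity": "critical",
        "consecutive_priority":     "critical",
        "percentage_priority":      "warning",
    }
    for key, value := range settings {
        if err := checkSeverity(key, value); err != nil {
            log.Fatal(err)
        }
    }
    fmt.Println("severity settings look valid")
}

Anything outside that set would be rejected by PagerDuty when the event is eventually sent.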
31 changes: 16 additions & 15 deletions td2/alert.go
@@ -5,13 +5,14 @@ import (
     "context"
     "encoding/json"
     "fmt"
-    "github.com/PagerDuty/go-pagerduty"
-    tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5"
     "log"
     "net/http"
     "strings"
     "sync"
     "time"
+
+    "github.com/PagerDuty/go-pagerduty"
+    tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5"
 )
 
 type alertMsg struct {
@@ -34,8 +35,8 @@ type alertMsg struct {
     discHook string
     discMentions string
 
-    slkHook string
-    slkMentions string
+    slkHook string
+    slkMentions string
 }
 
 type notifyDest uint8
@@ -206,9 +207,9 @@ func buildSlackMessage(msg *alertMsg) *SlackMessage {
     return &SlackMessage{
         Text: msg.message,
         Attachments: []Attachment{
-            Attachment{
-                Title: fmt.Sprintf("TenderDuty %s %s %s", prefix, msg.chain, msg.slkMentions),
-                Color: color,
+            {
+                Title: fmt.Sprintf("TenderDuty %s %s %s", prefix, msg.chain, msg.slkMentions),
+                Color: color,
             },
         },
     }
@@ -481,7 +482,7 @@ func (cc *ChainConfig) watch() {
     td.alert(
         cc.name,
         fmt.Sprintf("stalled: have not seen a new block on %s in %d minutes", cc.ChainId, cc.Alerts.Stalled),
-        "critical",
+        "info",
         true,
         &cc.valInfo.Valcons,
     )
@@ -525,7 +526,7 @@ func (cc *ChainConfig) watch() {
     td.alert(
         cc.name,
         fmt.Sprintf("%s has missed %d blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveMissed, cc.ChainId),
-        "critical",
+        cc.Alerts.ConsecutivePriority,
         false,
         &id,
     )
@@ -537,7 +538,7 @@ func (cc *ChainConfig) watch() {
     td.alert(
         cc.name,
         fmt.Sprintf("%s has missed %d blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveMissed, cc.ChainId),
-        "critical",
+        "info",
         true,
         &id,
     )
@@ -552,7 +553,7 @@ func (cc *ChainConfig) watch() {
     td.alert(
         cc.name,
         fmt.Sprintf("%s has missed > %d%% of the slashing window's blocks on %s", cc.valInfo.Moniker, cc.Alerts.Window, cc.ChainId),
-        "critical",
+        cc.Alerts.PercentagePriority,
         false,
         &id,
     )
@@ -564,7 +565,7 @@ func (cc *ChainConfig) watch() {
     td.alert(
         cc.name,
         fmt.Sprintf("%s has missed > %d%% of the slashing window's blocks on %s", cc.valInfo.Moniker, cc.Alerts.Window, cc.ChainId),
-        "critical",
+        "info",
         false,
         &id,
     )
@@ -585,8 +586,8 @@ func (cc *ChainConfig) watch() {
     nodeAlarms[node.Url] = true // used to keep active alert count correct
     td.alert(
         cc.name,
-        fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId),
-        "critical",
+        fmt.Sprintf("Severity: %s\nRPC node %s has been down for > %d minutes on %s", td.NodeDownSeverity, node.Url, td.NodeDownMin, cc.ChainId),
+        td.NodeDownSeverity,
         false,
         &node.Url,
     )
@@ -596,7 +597,7 @@ func (cc *ChainConfig) watch() {
     node.wasDown = false
     td.alert(
         cc.name,
-        fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId),
+        fmt.Sprintf("Severity: %s\nRPC node %s has been down for > %d minutes on %s", td.NodeDownSeverity, node.Url, td.NodeDownMin, cc.ChainId),
         "info",
         true,
         &node.Url,
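The hunks above pass the configured severity through td.alert instead of a hard-coded "critical", with "info" used for the resolution messages. The PagerDuty notifier itself is not shown in this diff, but with the imported github.com/PagerDuty/go-pagerduty client the natural destination for such a string is the Payload.Severity field of an Events API v2 event. A rough sketch of that shape, assuming a hypothetical sendPagerDutyAlert helper with a placeholder routing key and dedup scheme:

package main

import (
    "context"
    "log"

    "github.com/PagerDuty/go-pagerduty"
)

// sendPagerDutyAlert is a hypothetical stand-in for the notifier that td.alert
// eventually reaches; only the severity handling is the point of this sketch.
func sendPagerDutyAlert(ctx context.Context, routingKey, chain, message, severity string, resolved bool) error {
    action := "trigger"
    if resolved {
        action = "resolve"
    }
    event := pagerduty.V2Event{
        RoutingKey: routingKey,
        Action:     action,
        DedupKey:   chain + ":" + message, // placeholder dedup scheme
        Payload: &pagerduty.V2Payload{
            Summary:  message,
            Source:   chain,
            Severity: severity, // e.g. td.NodeDownSeverity or cc.Alerts.ConsecutivePriority
        },
    }
    _, err := pagerduty.ManageEventWithContext(ctx, event)
    return err
}

func main() {
    err := sendPagerDutyAlert(context.Background(), "YOUR-ROUTING-KEY",
        "osmosis-1", "RPC node has been down for > 3 minutes", "critical", false)
    if err != nil {
        log.Println("pagerduty:", err)
    }
}

Tenderduty's real notifier may shape the event differently; the point is that the severity argument shown in these hunks is what a PagerDuty event's severity field would carry.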
2 changes: 2 additions & 0 deletions td2/types.go
@@ -48,6 +48,8 @@ type Config struct {
     // NodeDownMin controls how long we wait before sending an alert that a node is not responding or has
     // fallen behind.
     NodeDownMin int `yaml:"node_down_alert_minutes"`
+    // NodeDownSeverity controls the Pagerduty severity when notifying if a node is down.
+    NodeDownSeverity string `yaml:"node_down_alert_severity"`
 
     // Prom controls if the prometheus exporter is enabled.
     Prom bool `yaml:"prometheus_enabled"`
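The new NodeDownSeverity field picks up the node_down_alert_severity key through its yaml struct tag, alongside the existing NodeDownMin. A self-contained sketch of that mapping, using gopkg.in/yaml.v2 for illustration and a trimmed-down struct rather than the real Config:

package main

import (
    "fmt"
    "log"

    "gopkg.in/yaml.v2"
)

// miniConfig mirrors only the two fields relevant to this change;
// the real Config struct in td2/types.go has many more.
type miniConfig struct {
    NodeDownMin      int    `yaml:"node_down_alert_minutes"`
    NodeDownSeverity string `yaml:"node_down_alert_severity"`
}

func main() {
    raw := []byte("node_down_alert_minutes: 3\nnode_down_alert_severity: critical\n")

    var cfg miniConfig
    if err := yaml.Unmarshal(raw, &cfg); err != nil {
        log.Fatal(err)
    }
    // Prints: node down alert after 3m with severity "critical"
    fmt.Printf("node down alert after %dm with severity %q\n", cfg.NodeDownMin, cfg.NodeDownSeverity)
}

Running it prints the parsed values, confirming the key-to-field mapping added in this hunk.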
