Skip to content

Commit

Permalink
feat(logs): logfmt standard + provisioned slack alerts (#475)
Browse files Browse the repository at this point in the history
  • Loading branch information
0xNineteen authored Jan 8, 2025
1 parent 0fb04c6 commit 62a2da4
Show file tree
Hide file tree
Showing 13 changed files with 436 additions and 65 deletions.
1 change: 1 addition & 0 deletions metrics/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.env
8 changes: 8 additions & 0 deletions metrics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,14 @@ linux: `docker compose down`

`./zig-out/bin/sig gossip -n testnet 2>&1 | tee -a logs/sig.log`

## Setting up Alerts

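Set the Slack webhook URL as the `SLACK_WEBHOOK_URL` environment variable in a `metrics/.env` file. Omit the `https://` scheme; the provisioned contact point prepends it.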

```
SLACK_WEBHOOK_URL=hooks.slack.com/services/AAA/BBB/CCC
```

## Expected result

```
Expand Down
34 changes: 33 additions & 1 deletion metrics/alloy/config.alloy
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,42 @@ local.file_match "local_files" {

// Reads log lines from the files matched by local.file_match.local_files and
// forwards them to the next pipeline component.
loki.source.file "log_scrape" {
targets = local.file_match.local_files.targets
// NOTE(review): two forward_to assignments are visible because this is a diff
// rendering (the hunk reports one deletion) — presumably the first line is the
// removed one and the second, routing through loki.process.process_log, is the
// one in effect. Confirm against the real file.
forward_to = [loki.write.grafana_loki.receiver]
forward_to = [loki.process.process_log.receiver]
// NOTE(review): presumably starts reading at the end of each file when no saved
// position exists — confirm against the Alloy loki.source.file reference.
tail_from_end = true
}

// Log pipeline for sig: parse each logfmt line, take the entry timestamp from
// the line itself, promote selected fields to Loki labels, tag the stream with
// a static source label, then hand the entry to the Loki writer.
loki.process "process_log" {
	forward_to = [loki.write.grafana_loki.receiver]

	// Extract these logfmt fields; every mapping value is left empty.
	stage.logfmt {
		mapping = {
			level   = "",
			scope   = "",
			message = "",
			time    = "",
		}
	}

	// Use the extracted "time" field as the entry timestamp.
	stage.timestamp {
		source = "time"
		format = "2006-01-02T15:04:05.000Z"
	}

	// Promote extracted fields to labels. The log-error alert summary reads
	// the scope and message labels, so both must stay here.
	stage.labels {
		values = {
			level   = "",
			scope   = "",
			message = "",
		}
	}

	// Constant label attached to every entry from this pipeline.
	stage.static_labels {
		values = {
			source = "sig",
		}
	}
}

loki.write "grafana_loki" {
endpoint {
url = "http://loki:3100/loki/api/v1/push"
Expand Down
4 changes: 3 additions & 1 deletion metrics/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,20 @@ services:
- SIG_PID=${SIG_PID}

# Grafana service: serves the UI on port 3000 and loads datasources, dashboards
# and alerting configuration from the mounted ./grafana directories.
grafana:
# NOTE(review): two image keys are visible because this is a diff rendering —
# presumably the unpinned image is the removed line and the 11.4.0 pin is the
# one in effect. Confirm against the real file.
image: grafana/grafana
image: grafana/grafana:11.4.0
container_name: grafana
ports:
- 3000:3000
restart: unless-stopped
environment:
# NOTE(review): hard-coded admin credentials — fine for a local stack only;
# confirm this compose file is never used for an exposed deployment.
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=grafana
# Passed through from the host environment / .env file; referenced as
# ${SLACK_WEBHOOK_URL} by the provisioned Slack contact point.
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
volumes:
- ./grafana/datasources:/etc/grafana/provisioning/datasources
- ./grafana/dashboards:/etc/grafana/provisioning/dashboards
- ./grafana/dashboards:/var/lib/grafana/dashboards
# Alert rules and contact points provisioned from files.
- ./grafana/alerting:/etc/grafana/provisioning/alerting

node-exporter:
image: prom/node-exporter
Expand Down
153 changes: 153 additions & 0 deletions metrics/grafana/alerting/alert_rules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# Provisioned Grafana alert rules for sig.
# Both rules are evaluated every minute and notify the slack-sig-alerts
# contact point directly via notification_settings.
apiVersion: 1
groups:
  - orgId: 1
    name: sig-alert-evaluation-group
    folder: sig-alert-rule
    interval: 1m
    rules:
      # Fires when used RAM (total minus free, cached, buffers and reclaimable
      # slab) stays above 80% of total for one minute.
      - uid: de9ayy0ojq8e8d
        title: ram-usage-alert
        condition: C
        data:
          # A: instant query giving the current RAM usage percentage.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: PBFA97CFB590B2093
            model:
              editorMode: code
              expr: |
                ((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_SReclaimable_bytes)) / node_memory_MemTotal_bytes) * 100
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          # C: threshold expression, true when A > 80.
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 80
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 1m
        annotations:
          summary: "sig memory warning: using {{ $values.A.Value }}% of RAM"
        labels: {}
        isPaused: false
        notification_settings:
          receiver: slack-sig-alerts

      # Fires as soon as any log line with level="error" appears in the last
      # five minutes; scope and message come from the log stream labels.
      - uid: ce8wwpcp2dkowb
        title: log-error-alert
        condition: C
        data:
          # A: range query returning one series per error stream, and nothing
          # at all when there are no error logs.
          - refId: A
            queryType: range
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: P8E80F9AEF21F6940
            model:
              datasource:
                type: loki
                uid: P8E80F9AEF21F6940
              editorMode: code
              expr: 'count_over_time({level="error"}[5m]) > 0'
              intervalMs: 1000
              maxDataPoints: 43200
              queryType: range
              refId: A
          # B: reduce each series from A to its point count.
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: []
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - B
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: count
              refId: B
              type: reduce
          # C: threshold expression, true when B > 0.
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        dashboardUid: jBuN47BVz
        panelId: 26
        # "No error logs" is the healthy state and yields an empty query
        # result, so treat no-data as OK instead of raising a no-data alert
        # that would be sent to Slack.
        noDataState: OK
        execErrState: Error
        for: 0s
        annotations:
          __dashboardUid__: jBuN47BVz
          __panelId__: "26"
          description: ""
          runbook_url: ""
          summary: "error: [{{ $labels.scope }}]: {{ $labels.message }} "
        # No extra labels; an empty-name/empty-value pair is not a valid label.
        labels: {}
        isPaused: false
        notification_settings:
          receiver: slack-sig-alerts
17 changes: 17 additions & 0 deletions metrics/grafana/alerting/contact_points.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Provisioned Grafana contact point that posts firing alerts to Slack.
apiVersion: 1
contactPoints:
  - orgId: 1
    name: slack-sig-alerts
    editable: true
    receivers:
      - uid: ae8wvbjn9bncxe
        type: slack
        settings:
          recipient: sig
          # SLACK_WEBHOOK_URL holds the webhook host and path without the
          # scheme, so the scheme is prepended here.
          url: "https://${SLACK_WEBHOOK_URL}"
          # One line per firing alert. The panel link is appended only when
          # the alert is tied to a dashboard panel, so rules without one do
          # not end in a dangling " -- ".
          text: |-
            {{ range .Alerts.Firing }}
            {{ .Annotations.summary }}{{ if .PanelURL }} -- {{ .PanelURL }}{{ end }}
            {{ end }}
          title: "{{ len .Alerts.Firing }} Alert(s) Firing"
        disableResolveMessage: false
Loading

0 comments on commit 62a2da4

Please sign in to comment.