Skip to content

Commit

Permalink
fix(gossip): consistent scope + less alerts (#484)
Browse files Browse the repository at this point in the history
  • Loading branch information
0xNineteen authored Jan 9, 2025
1 parent 62a2da4 commit 3c6d049
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 28 deletions.
14 changes: 5 additions & 9 deletions metrics/grafana/alerting/alert_rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,11 @@ groups:
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
noDataState: KeepLast
execErrState: Error
for: 1m
annotations:
summary: "sig memory warning: using {{ $values.A.Value }}% of RAM"
labels: {}
isPaused: false
notification_settings:
receiver: slack-sig-alerts
Expand Down Expand Up @@ -137,17 +136,14 @@ groups:
type: threshold
dashboardUid: jBuN47BVz
panelId: 26
noDataState: NoData
noDataState: KeepLast
for: 1m
execErrState: Error
for: 0s
annotations:
__dashboardUid__: jBuN47BVz
__panelId__: "26"
description: ""
runbook_url: ""
summary: "error: [{{ $labels.scope }}]: {{ $labels.message }} "
labels:
"": ""
summary: "error: [{{ $labels.scope }}]: {{ $labels.message }}"
labels: {}
isPaused: false
notification_settings:
receiver: slack-sig-alerts
2 changes: 1 addition & 1 deletion metrics/grafana/alerting/contact_points.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ contactPoints:
url: https://${SLACK_WEBHOOK_URL}
text: |-
{{ range .Alerts.Firing }}
{{ .Annotations.summary }} -- {{ .PanelURL }}
{{ .Annotations.summary }}
{{ end }}
title: "{{ len .Alerts.Firing }} Alert(s) Firing"
disableResolveMessage: false
47 changes: 45 additions & 2 deletions metrics/grafana/dashboards/gossip_metrics.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 2,
"id": 6,
"links": [],
"panels": [
{
Expand Down Expand Up @@ -2007,6 +2007,49 @@
],
"title": "Error Service Logs",
"type": "logs"
},
{
"datasource": {
"default": false,
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"fieldConfig": {
"defaults": {},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 64
},
"id": 31,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"pluginVersion": "11.4.0",
"targets": [
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"editorMode": "builder",
"expr": "{scope=\"gossip_service\", level=\"warning\"} |= ``",
"queryType": "range",
"refId": "A"
}
],
"title": "Warning Service Logs",
"type": "logs"
}
],
"preload": false,
Expand All @@ -2024,6 +2067,6 @@
"timezone": "",
"title": "Gossip Metrics",
"uid": "jBuN47BVz",
"version": 5,
"version": 2,
"weekStart": ""
}
33 changes: 17 additions & 16 deletions src/gossip/service.zig
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
const std = @import("std");
const network = @import("zig-network");
const sig = @import("../sig.zig");
const Bloom = @import("../bloom/bloom.zig").Bloom;

const bincode = sig.bincode;
const socket_utils = sig.net.socket_utils;
Expand All @@ -15,10 +14,10 @@ const KeyPair = std.crypto.sign.Ed25519.KeyPair;
const EndPoint = network.EndPoint;
const UdpSocket = network.Socket;

const Bloom = sig.bloom.Bloom;
const Pubkey = sig.core.Pubkey;
const Hash = sig.core.Hash;
const Logger = sig.trace.log.Logger;
const ScopedLogger = sig.trace.log.ScopedLogger;
const Packet = sig.net.Packet;
const EchoServer = sig.net.echo.Server;
const SocketAddr = sig.net.SocketAddr;
Expand Down Expand Up @@ -169,13 +168,14 @@ pub const GossipService = struct {
thread_pool: ThreadPool,
// TODO: re-enable the echo_server field below once the HTTP server is working
// echo_server: EchoServer,
logger: ScopedLogger(LOG_SCOPE),
logger: ScopedLogger,
metrics: GossipMetrics,
service_manager: ServiceManager,

const Self = @This();

pub const LOG_SCOPE = "gossip_service";
pub const ScopedLogger = sig.trace.log.ScopedLogger(LOG_SCOPE);

const Entrypoint = struct { addr: SocketAddr, info: ?ContactInfo = null };

Expand Down Expand Up @@ -450,7 +450,7 @@ pub const GossipService = struct {
gossip_value_allocator: std.mem.Allocator,
packet: Packet,
verified_incoming_channel: *Channel(GossipMessageWithEndpoint),
logger: ScopedLogger(@typeName(VerifyMessageEntry)),
logger: ScopedLogger,

pub fn callback(self: *VerifyMessageEntry) !void {
const packet = self.packet;
Expand All @@ -460,19 +460,19 @@ pub const GossipService = struct {
packet.data[0..packet.size],
bincode.Params.standard,
) catch |e| {
self.logger.err().logf("gossip: packet_verify: failed to deserialize: {s}", .{@errorName(e)});
self.logger.err().logf("packet_verify: failed to deserialize: {s}", .{@errorName(e)});
return;
};

message.sanitize() catch |e| {
self.logger.err().logf("gossip: packet_verify: failed to sanitize: {s}", .{@errorName(e)});
self.logger.err().logf("packet_verify: failed to sanitize: {s}", .{@errorName(e)});
bincode.free(self.gossip_value_allocator, message);
return;
};

message.verifySignature() catch |e| {
self.logger.err().logf(
"gossip: packet_verify: failed to verify signature from {}: {s}",
"packet_verify: failed to verify signature from {}: {s}",
.{ packet.addr, @errorName(e) },
);
bincode.free(self.gossip_value_allocator, message);
Expand Down Expand Up @@ -508,7 +508,7 @@ pub const GossipService = struct {
.gossip_value_allocator = self.gossip_value_allocator,
.verified_incoming_channel = self.verified_incoming_channel,
.packet = undefined,
.logger = self.logger.withScope(@typeName(VerifyMessageEntry)),
.logger = self.logger,
};
}

Expand Down Expand Up @@ -850,7 +850,7 @@ pub const GossipService = struct {
var x_timer = sig.time.Timer.start() catch unreachable;
const now = getWallclockMs();
const n_pubkeys_dropped = gossip_table.attemptTrim(now, UNIQUE_PUBKEY_CAPACITY) catch |err| err_blk: {
self.logger.warn().logf("gossip_table.attemptTrim failed: {s}", .{@errorName(err)});
self.logger.err().logf("gossip_table.attemptTrim failed: {s}", .{@errorName(err)});
break :err_blk 0;
};
const elapsed = x_timer.read().asMillis();
Expand Down Expand Up @@ -1430,7 +1430,7 @@ pub const GossipService = struct {
for (tasks) |*task| {
packet_loop: for (task.output.items) |output| {
self.packet_outgoing_channel.send(output) catch {
self.logger.err().log("failed to send outgoing packet");
self.logger.err().log("handleBatchPullRequest: failed to send outgoing packet");
break :packet_loop;
};
self.metrics.pull_responses_sent.add(1);
Expand Down Expand Up @@ -1844,10 +1844,11 @@ pub const GossipService = struct {
for (self.entrypoints.items) |entrypoint| {
if (entrypoint.info) |info| {
if (info.shred_version != 0) {
self.logger.info().logf(
"shred version: {} - from entrypoint contact info: {s}",
.{ info.shred_version, entrypoint.addr.toString().constSlice() },
);
self.logger.info()
.field("shred_version", info.shred_version)
.field("entrypoint", entrypoint.addr.toString().constSlice())
.log("shred_version_from_entrypoint");

self.my_shred_version.store(info.shred_version, .monotonic);
self.my_contact_info.shred_version = info.shred_version;
return true;
Expand Down Expand Up @@ -2098,7 +2099,7 @@ pub const GossipMetrics = struct {
// logging details
_logging_fields: struct {
// The logger is scoped to GossipService rather than to this logging-fields struct.
logger: ScopedLogger(GossipService.LOG_SCOPE),
logger: GossipService.ScopedLogger,
log_interval_micros: i64 = 10 * std.time.us_per_s,
last_log: i64 = 0,
last_logged_snapshot: StatsToLog = .{},
Expand Down Expand Up @@ -2135,7 +2136,7 @@ pub const GossipMetrics = struct {
5000, 10000,
};

pub fn init(logger: ScopedLogger(GossipService.LOG_SCOPE)) GetMetricError!Self {
pub fn init(logger: GossipService.ScopedLogger) GetMetricError!Self {
var self: Self = undefined;
const registry = globalRegistry();
std.debug.assert(try registry.initFields(&self) == 1);
Expand Down

0 comments on commit 3c6d049

Please sign in to comment.