Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add health check endpoint #665

Merged
merged 11 commits into from
May 8, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ jobs:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4
with:
fetch-depth: 0
- uses: actions/setup-go@0c52d547c9bc32b1aa3301fd7a9cb496313a4491 # v5.0.0
with:
go-version-file: 'go.mod'
Expand Down Expand Up @@ -144,6 +146,8 @@ jobs:
version: "bookworm-slim"
steps:
- uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4
with:
fetch-depth: 0
- uses: actions/setup-go@0c52d547c9bc32b1aa3301fd7a9cb496313a4491 # v5.0.0
with:
go-version-file: 'go.mod'
Expand Down
61 changes: 61 additions & 0 deletions docs/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,23 @@
"info": {},
"host": "localhost:8081",
"paths": {
"/health": {
"get": {
"tags": [
"nginx-agent"
],
"summary": "Check the health of the NGINX Agent",
"operationId": "health-check",
"responses": {
"200": {
"description": "HealthResponse",
"schema": {
"$ref": "#/definitions/HealthResponse"
}
}
}
}
},
"/metrics/": {
"get": {
"description": "# Returns prometheus metrics",
Expand Down Expand Up @@ -127,6 +144,12 @@
"schema": {
"$ref": "#/definitions/AgentAPIConfigApplyStatusResponse"
}
},
"500": {
"description": "AgentAPICommonResponse",
"schema": {
"$ref": "#/definitions/AgentAPICommonResponse"
}
}
}
}
Expand Down Expand Up @@ -195,6 +218,44 @@
},
"x-go-package": "github.com/nginx/agent/v2/src/plugins"
},
"HealthResponse": {
"type": "object",
"properties": {
"checks": {
"description": "Array of health checks",
"type": "array",
dhurley marked this conversation as resolved.
Show resolved Hide resolved
"items": {
"$ref": "#/definitions/HealthStatusCheck"
},
"x-go-name": "Checks"
},
"status": {
"description": "Overall health status",
"type": "string",
"x-go-name": "Status",
"example": "OK"
}
},
"x-go-package": "github.com/nginx/agent/v2/src/plugins"
},
"HealthStatusCheck": {
"type": "object",
"properties": {
"name": {
"description": "Health check name",
"type": "string",
"x-go-name": "Name",
"example": "commandServiceConnection"
dhurley marked this conversation as resolved.
Show resolved Hide resolved
},
"status": {
"description": "Health check status",
"type": "string",
"x-go-name": "Status",
"example": "OK"
}
},
"x-go-package": "github.com/nginx/agent/v2/src/plugins"
},
"NginxDetails": {
"type": "object",
"properties": {
Expand Down
5 changes: 1 addition & 4 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,7 @@ func main() {
controller, commander, reporter := core.CreateGrpcClients(ctx, loadedConfig)

if controller != nil {
if err := controller.Connect(); err != nil {
log.Warnf("Unable to connect to control plane: %v", err)
return
}
go controller.Connect()
}

binary := core.NewNginxBinary(env, loadedConfig)
Expand Down
2 changes: 1 addition & 1 deletion sdk/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ type (
WithClient(Client) Controller
Context() context.Context
WithContext(context.Context) Controller
Connect() error
Connect()
Close() error
}
)
10 changes: 10 additions & 0 deletions sdk/client/commander.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ package client
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"sync"
Expand Down Expand Up @@ -74,11 +75,15 @@ func (c *commander) Connect(ctx context.Context) error {
log.Debugf("Commander connecting to %s", c.server)

c.ctx = ctx

c.retryLock.Lock()
err := backoff.WaitUntil(
c.ctx,
c.backoffSettings,
c.createClient,
)
c.retryLock.Unlock()

if err != nil {
return err
}
Expand Down Expand Up @@ -163,6 +168,11 @@ func (c *commander) Send(ctx context.Context, message Message) error {
return err
}

if c.channel == nil {
c.setIsRetrying(true)
return c.handleGrpcError("Commander Channel Send", errors.New("command channel client not created yet"))
}

if err := c.channel.Send(cmd); err != nil {
c.setIsRetrying(true)
return c.handleGrpcError("Commander Channel Send", err)
Expand Down
13 changes: 4 additions & 9 deletions sdk/client/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ package client
import (
"context"
"fmt"

log "github.com/sirupsen/logrus"
)

func NewClientController() Controller {
Expand All @@ -33,19 +35,12 @@ func (c *ctrl) WithContext(ctx context.Context) Controller {
return c
}

func (c *ctrl) Connect() error {
var retErr error
func (c *ctrl) Connect() {
for _, client := range c.clients {
if err := client.Connect(c.ctx); err != nil {
if retErr == nil {
retErr = fmt.Errorf("%s failed to connect: %w", client.Server(), err)
} else {
retErr = fmt.Errorf("%v\n%s failed to connect: %w", retErr, client.Server(), err)
}
log.Warnf("%s failed to connect: %v", client.Server(), err)
}
}

return retErr
}

func (c *ctrl) Close() error {
Expand Down
6 changes: 2 additions & 4 deletions sdk/client/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ func TestControllerConnect(t *testing.T) {
controller.WithClient(metricsReportClient)
controller.WithContext(ctx)

err := controller.Connect()
assert.Nil(t, err)
controller.Connect()

commanderClient.AssertNumberOfCalls(t, "Connect", 1)
metricsReportClient.AssertNumberOfCalls(t, "Connect", 1)
Expand Down Expand Up @@ -75,8 +74,7 @@ func TestControllerConnect_error(t *testing.T) {
controller.WithClient(metricsReportClient)
controller.WithContext(ctx)

err := controller.Connect()
assert.NotNil(t, err)
controller.Connect()

commanderClient.AssertNumberOfCalls(t, "Connect", 1)
metricsReportClient.AssertNumberOfCalls(t, "Connect", 1)
Expand Down
55 changes: 37 additions & 18 deletions sdk/client/metric_reporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ package client

import (
"context"
"errors"
"fmt"
"io"
"sync"
Expand All @@ -28,6 +29,7 @@ func NewMetricReporterClient() MetricReporter {
return &metricReporter{
connector: newConnector(),
backoffSettings: DefaultBackoffSettings,
isRetrying: false,
}
}

Expand All @@ -39,6 +41,8 @@ type metricReporter struct {
ctx context.Context
mu sync.Mutex
backoffSettings backoff.BackoffSettings
isRetrying bool
retryLock sync.Mutex
}

func (r *metricReporter) WithInterceptor(interceptor interceptors.Interceptor) Client {
Expand All @@ -57,11 +61,14 @@ func (r *metricReporter) Connect(ctx context.Context) error {
log.Debugf("Metric Reporter connecting to %s", r.server)

r.ctx = ctx

r.retryLock.Lock()
err := backoff.WaitUntil(
r.ctx,
r.backoffSettings,
r.createClient,
)
r.retryLock.Unlock()
if err != nil {
return err
}
Expand Down Expand Up @@ -151,18 +158,19 @@ func (r *metricReporter) Send(ctx context.Context, message Message) error {
return fmt.Errorf("MetricReporter expected a metrics report message, but received %T", message.Data())
}

isRetrying := false

err = backoff.WaitUntil(r.ctx, r.backoffSettings, func() error {
if isRetrying {
log.Infof("Metric Reporter Channel Send: retrying to connect to %s", r.grpc.Target())
err := r.createClient()
if err != nil {
return err
}
err := r.checkClientConnection()
if err != nil {
return err
}

if r.channel == nil {
r.isRetrying = true
return r.handleGrpcError("Metric Reporter Channel Send", errors.New("metric service stream client not created yet"))
}

if err := r.channel.Send(report); err != nil {
isRetrying = true
r.isRetrying = true
return r.handleGrpcError("Metric Reporter Channel Send", err)
}

Expand All @@ -176,18 +184,14 @@ func (r *metricReporter) Send(ctx context.Context, message Message) error {
return fmt.Errorf("MetricReporter expected an events report message, but received %T", message.Data())
}

isRetrying := false

err = backoff.WaitUntil(r.ctx, r.backoffSettings, func() error {
if isRetrying {
log.Infof("Metric Reporter Channel Send: retrying to connect to %s", r.grpc.Target())
err = r.createClient()
if err != nil {
return err
}
err := r.checkClientConnection()
if err != nil {
return err
}

if err := r.eventsChannel.Send(report); err != nil {
isRetrying = true
r.isRetrying = true
return r.handleGrpcError("Metric Reporter Events Channel Send", err)
}

Expand All @@ -202,6 +206,21 @@ func (r *metricReporter) Send(ctx context.Context, message Message) error {
return err
}

// checkClientConnection re-creates the reporter's client when the reporter has
// been flagged as retrying, and is a no-op otherwise.
//
// retryLock is held for the whole call, so the check of isRetrying and the
// call to createClient happen as one step with respect to any other code path
// that takes the same lock.
//
// It returns the error from createClient, or nil when no reconnect was needed
// or the reconnect succeeded.
func (r *metricReporter) checkClientConnection() error {
	r.retryLock.Lock()
	defer r.retryLock.Unlock()

	// Nothing to do unless a reconnect has been requested.
	if !r.isRetrying {
		return nil
	}

	log.Infof("Metric Reporter Channel Send: retrying to connect to %s", r.grpc.Target())

	return r.createClient()
}

func (r *metricReporter) closeConnection() error {
var err error
if r.channel != nil {
Expand Down
3 changes: 2 additions & 1 deletion src/core/topics.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ package core
const (
UNKNOWN = "unknown"
RegistrationPrefix = "registration."
RegistrationCompletedTopic = RegistrationPrefix + "completed"
CommNginxConfig = "nginx.config"
NginxConfigUpload = "nginx.config.upload"
NginxReload = "nginx.reload"
Expand Down Expand Up @@ -49,4 +48,6 @@ const (
EnableExtension = "enable.extension"
EnableFeature = "enable.feature"
AgentAPIConfigApplyResponse = "agent.api.config.apply.response"
CommandSent = "command.sent"
MetricReportSent = "metrics.report.sent"
)
Loading
Loading