Skip to content

Commit

Permalink
fix(graceful-shutdown): implement configurable sutdown delay
Browse files Browse the repository at this point in the history
prevents in-flight requests to land on a pod that has already stopped.
  • Loading branch information
clementnuss authored Dec 14, 2023
2 parents 0196f2d + 461bda5 commit e5c13c8
Show file tree
Hide file tree
Showing 11 changed files with 44 additions and 14 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ci-helm-deploy-nginx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ jobs:
sleep 15 # wait for the scheduler to create pods
kubectl -n kube-system wait pods -l app.kubernetes.io/name=kubenurse --for=condition=Ready
kubectl -n kube-system get pods -l app.kubernetes.io/name=kubenurse
kubectl rollout restart daemonset kubenurse
kubectl rollout status daemonset kubenurse --timeout=1m
sleep 60 # Wait to generate some checks etc.
- name: Check deployment
uses: ./.github/actions/check-deployment
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/ci-helm-deploy-traefik.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ jobs:
sleep 15 # wait for the scheduler to create pods
kubectl -n kube-system wait pods -l app=kubenurse --for=condition=Ready
kubectl -n kube-system get pods -l app=kubenurse
kubectl rollout restart daemonset kubenurse
kubectl rollout status daemonset kubenurse --timeout=1m
sleep 60 # Wait to generate some checks etc.
- name: Check deployment
uses: ./.github/actions/check-deployment
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/ci-kustomize-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ jobs:
sleep 15 # wait for the scheduler to create pods
kubectl wait pods -l app.kubernetes.io/name=kubenurse --for=condition=Ready
kubectl get pods -l app.kubernetes.io/name=kubenurse
kubectl rollout restart daemonset kubenurse
kubectl rollout status daemonset kubenurse --timeout=1m
sleep 60 # Wait to generate some checks etc.
- name: Check deployment
uses: ./.github/actions/check-deployment
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
- uses: actions/checkout@v4
- uses: golangci/golangci-lint-action@v3
with:
version: v1.52
version: v1.55
args: --timeout 5m
lint-helm:
runs-on: ubuntu-latest
Expand Down
8 changes: 2 additions & 6 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,10 @@ linters-settings:
- gocognit
- funlen
- gocyclo

linters:
disable-all: true
enable:
- bodyclose
- deadcode
- depguard
- dogsled
- dupl
- errcheck
Expand All @@ -71,19 +68,18 @@ linters:
- misspell
- nakedret
- prealloc
- protogetter
- rowserrcheck
- exportloopref
- staticcheck
- structcheck
- stylecheck
- sqlclosecheck
- typecheck
- unconvert
- unparam
- unused
- varcheck
- whitespace
- wsl
issues:
exclude:
# Very commonly not checked.
- 'Error return value of .(l.Sync|.*Close|.*.Write|.*Flush|os\.Remove(All)?|os\.(Un)?Setenv). is not checked'
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
FROM alpine:latest
MAINTAINER OpenSource PF <[email protected]>
LABEL OpenSource="PF <[email protected]>"

RUN apk --no-cache add ca-certificates curl
COPY kubenurse /bin/kubenurse
Expand Down
7 changes: 5 additions & 2 deletions internal/kubediscovery/kubediscovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"context"
"fmt"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"k8s.io/client-go/kubernetes"
Expand Down Expand Up @@ -35,7 +36,8 @@ type Neighbour struct {
HostIP string
NodeName string
NodeSchedulable NodeSchedulability
Phase string // Pod Phase
Phase v1.PodPhase
Terminating bool
}

// New creates a new kubediscovery client. The context is used to stop the k8s watchers/informers.
Expand Down Expand Up @@ -92,8 +94,9 @@ func (c *Client) GetNeighbours(ctx context.Context, namespace, labelSelector str
PodName: pod.Name,
PodIP: pod.Status.PodIP,
HostIP: pod.Status.HostIP,
Phase: string(pod.Status.Phase),
Phase: pod.Status.Phase,
NodeName: pod.Spec.NodeName,
Terminating: pod.DeletionTimestamp != nil,
NodeSchedulable: sched,
}
neighbours[idx] = n
Expand Down
19 changes: 19 additions & 0 deletions internal/kubenurse/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ type Server struct {
// * KUBERNETES_SERVICE_PORT
// * KUBENURSE_NAMESPACE
// * KUBENURSE_NEIGHBOUR_FILTER
// * KUBENURSE_SHUTDOWN_DURATION
// * KUBENURSE_CHECK_API_SERVER_DIRECT
// * KUBENURSE_CHECK_API_SERVER_DNS
// * KUBENURSE_CHECK_ME_INGRESS
Expand Down Expand Up @@ -107,12 +108,24 @@ func New(ctx context.Context, k8s kubernetes.Interface) (*Server, error) {
return nil, err
}

shutdownDuration := 5 * time.Second

if v, ok := os.LookupEnv("KUBENURSE_SHUTDOWN_DURATION"); ok {
var err error
shutdownDuration, err = time.ParseDuration(v)

if err != nil {
return nil, err
}
}

chk.KubenurseIngressURL = os.Getenv("KUBENURSE_INGRESS_URL")
chk.KubenurseServiceURL = os.Getenv("KUBENURSE_SERVICE_URL")
chk.KubernetesServiceHost = os.Getenv("KUBERNETES_SERVICE_HOST")
chk.KubernetesServicePort = os.Getenv("KUBERNETES_SERVICE_PORT")
chk.KubenurseNamespace = os.Getenv("KUBENURSE_NAMESPACE")
chk.NeighbourFilter = os.Getenv("KUBENURSE_NEIGHBOUR_FILTER")
chk.ShutdownDuration = shutdownDuration

//nolint:goconst // No need to make "false" a constant in my opinion, readability is better like this.
chk.SkipCheckAPIServerDirect = os.Getenv("KUBENURSE_CHECK_API_SERVER_DIRECT") == "false"
Expand Down Expand Up @@ -198,6 +211,12 @@ func (s *Server) Shutdown(ctx context.Context) error {
s.ready = false
s.mu.Unlock()

// wait before actually shutting down the http/s server, as the updated
// endpoints for the kubenurse service might not have propagated everywhere
// (other kubenurse/ingress controller) yet, which will lead to
// me_ingress or path errors in other pods
time.Sleep(s.checker.ShutdownDuration)

// stop the scheduled checker
s.checker.StopScheduled()

Expand Down
9 changes: 6 additions & 3 deletions internal/servicecheck/servicecheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (

"github.com/postfinance/kubenurse/internal/kubediscovery"
"github.com/prometheus/client_golang/prometheus"
v1 "k8s.io/api/core/v1"
)

const (
Expand Down Expand Up @@ -170,7 +171,7 @@ func (c *Checker) MeIngress() (string, error) {
return skippedStr, nil
}

return c.doRequest(c.KubenurseIngressURL + "/alwayshappy")
return c.doRequest(c.KubenurseIngressURL + "/alwayshappy") //nolint:goconst // readability
}

// MeService checks if the kubenurse is reachable at the /alwayshappy endpoint through the kubernetes service
Expand All @@ -186,8 +187,10 @@ func (c *Checker) MeService() (string, error) {
// which are not schedulable are excluded from this check to avoid possible false errors.
func (c *Checker) checkNeighbours(nh []kubediscovery.Neighbour) {
for _, neighbour := range nh {
neighbour := neighbour // pin
if c.allowUnschedulable || neighbour.NodeSchedulable == kubediscovery.NodeSchedulable {
neighbour := neighbour // pin
if neighbour.Phase == v1.PodRunning && // only query running pods (excludes pending ones)
!neighbour.Terminating && // exclude terminating pods
(c.allowUnschedulable || neighbour.NodeSchedulable == kubediscovery.NodeSchedulable) {
check := func() (string, error) {
if c.UseTLS {
return c.doRequest("https://" + neighbour.PodIP + ":8443/alwayshappy")
Expand Down
2 changes: 1 addition & 1 deletion internal/servicecheck/transport.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ func generateRoundTripper(extraCA string, insecure bool) (http.RoundTripper, err

// Append extra CA, if set
if extraCA != "" {
caCert, err := os.ReadFile(extraCA) //nolint:gosec // Intentionally included by the user.
caCert, err := os.ReadFile(extraCA) // Intentionally included by the user.
if err != nil {
return nil, fmt.Errorf("could not load certificate %s: %w", extraCA, err)
}
Expand Down
3 changes: 3 additions & 0 deletions internal/servicecheck/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ type Checker struct {
SkipCheckMeIngress bool
SkipCheckMeService bool

// shutdownDuration defines the time during which kubenurse will wait before stopping
ShutdownDuration time.Duration

// Kubernetes API
KubernetesServiceHost string
KubernetesServicePort string
Expand Down

0 comments on commit e5c13c8

Please sign in to comment.