Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: pod and helm health #142

Merged
merged 2 commits into from
Nov 26, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 39 additions & 10 deletions pkg/health/health_pod.go
Original file line number Diff line number Diff line change
@@ -52,6 +52,13 @@ func isErrorStatus(s string) bool {
strings.HasSuffix(s, "BackOff")
}

// pluralize returns s as-is when i is exactly 1; for any other count
// (zero and negative values included) it returns s with an "s" appended.
func pluralize(s string, i int) string {
	suffix := "s"
	if i == 1 {
		suffix = ""
	}
	return s + suffix
}

func getContainerStatus(containerStatus corev1.ContainerStatus) (waiting *HealthStatus, terminated *HealthStatus) {
if state := containerStatus.State.Waiting; state != nil {
waiting = &HealthStatus{
@@ -67,16 +74,37 @@ func getContainerStatus(containerStatus corev1.ContainerStatus) (waiting *Health

if state := containerStatus.LastTerminationState.Terminated; state != nil {
age := time.Since(state.FinishedAt.Time)
// ignore old terminate statuses
if age < time.Hour*24 {
// ignore old terminated statuses

if containerStatus.RestartCount == 0 && state.ExitCode == 0 {
terminated = &HealthStatus{
Status: HealthStatusCompleted,
Health: HealthHealthy,
}
} else if age < time.Hour*24 {
terminated = &HealthStatus{
Status: HealthStatusCode(state.Reason),
Health: lo.Ternary(age < time.Hour, HealthUnhealthy, HealthWarning),
Message: state.Message,
}
if state.Reason == string(HealthStatusCompleted) && state.ExitCode == 0 {
// completed successfully
terminated.Health = HealthHealthy

if containerStatus.RestartCount > 0 {
terminated.AppendMessage("restarted %d %s", containerStatus.RestartCount, pluralize("time", int(containerStatus.RestartCount)))
}
if state.Reason == string(HealthStatusError) {
if age < 15*time.Minute {
terminated.Status = HealthStatusCrashLoopBackoff
} else if age < time.Hour {
terminated.Status = "Restarted"
}
} else if state.Reason == string(HealthStatusCompleted) && state.ExitCode != 0 {
// completed with error
terminated.Status = HealthStatusCode(state.Reason)
terminated.AppendMessage("exit=%d", state.ExitCode)
} else if state.Reason == string(HealthStatusCompleted) {
// completed with restart
terminated.Status = "RestartLoop"
} else {
terminated.Status = HealthStatusCode(state.Reason)
}
}
}
@@ -149,10 +177,11 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {

case corev1.PodRunning, corev1.PodPending:
hr = hr.Merge(terminated, waiting)
if terminated != nil && terminated.Health.IsWorseThan(HealthWarning) &&
hr.Status == HealthStatusCrashLoopBackoff {
hr.Status = terminated.Status
hr.Health = hr.Health.Worst(terminated.Health)
if terminated != nil && terminated.Health.IsWorseThan(HealthWarning) {
if hr.Status == HealthStatusCrashLoopBackoff {
hr.Status = terminated.Status
hr.Health = hr.Health.Worst(terminated.Health)
}
}
hr.Status, _ = lo.Coalesce(hr.Status, HealthStatusRunning)
hr.Health = hr.Health.Worst(lo.Ternary(isReady, HealthHealthy, HealthUnhealthy))
31 changes: 28 additions & 3 deletions pkg/health/status.go
Original file line number Diff line number Diff line change
@@ -179,7 +179,13 @@ func (mapped *Condition) Apply(health *HealthStatus, c *metav1.Condition) {
}
}

// Filter lets a StatusMap short-circuit health evaluation for objects whose
// fields hold specific values. GetHealth checks every entry of Match against
// the object and, when all of them are satisfied, applies the embedded
// OnCondition to a fresh HealthStatus instead of inspecting the object's status.
type Filter struct {
// OnCondition is the outcome applied (through its Apply method) by GetHealth
// when the filter matches.
OnCondition `yaml:",inline" json:",inline,omitempty"`

// Match maps a dot-separated field path (for example "spec.type") to the
// exact string value the object must hold at that path.
Match map[string]string `yaml:"match,omitempty" json:"match,omitempty"`
}
// StatusMap describes how the health of one resource kind is derived.
type StatusMap struct {
// Filters are checked in order by GetHealth before the object's status is
// evaluated; the first filter that matches decides the result.
Filters []Filter `yaml:"filters" json:"filters"`
// Conditions holds the per-condition mappings. NOTE(review): the key is
// presumably the condition type (e.g. "Reconciling") — confirm against
// GetHealthFromStatus.
Conditions map[string]Condition `yaml:"conditions" json:"conditions"`
// NOTE(review): the consumer of this flag is not visible here; presumably an
// unhealthy result is also reported as not ready when it is set — confirm.
UnhealthyIsNotReady bool `yaml:"unhealthyIsNotReady" json:"unhealthyIsNotReady"`
}
@@ -199,15 +205,34 @@ func GetDefaultHealth(obj *unstructured.Unstructured) (*HealthStatus, error) {
kind = "cnrm.cloud.google.com"
}
if statusMap, ok := statusByKind[obj.GetAPIVersion()+"/"+obj.GetKind()]; ok {
return GetHealthFromStatus(GetGenericStatus(obj), statusMap)
return GetHealth(obj, statusMap)
} else if statusMap, ok := statusByKind[kind]; ok {
return GetHealthFromStatus(GetGenericStatus(obj), statusMap)
return GetHealth(obj, statusMap)
} else {
return GetHealthFromStatus(GetGenericStatus(obj), statusByKind["default"])
return GetHealth(obj, statusByKind["default"])
}
}

// GetHealth computes the health of obj using statusMap.
//
// The filters in statusMap.Filters are tried in order. A filter matches when
// every entry of its Match map equals the string found in obj at the
// corresponding dot-separated path; a filter with an empty Match therefore
// matches every object. For the first matching filter a HealthStatus that
// starts as HealthUnknown is handed to the filter's OnCondition.Apply together
// with an empty metav1.Condition, and that status is returned with a nil error.
//
// When no filter matches (or none are configured) the result of
// GetHealthFromStatus(GetGenericStatus(obj), statusMap) is returned.
func GetHealth(obj *unstructured.Unstructured, statusMap StatusMap) (*HealthStatus, error) {
	for _, filter := range statusMap.Filters {
		matched := true
		for path, want := range filter.Match {
			// The "found" flag and the error from NestedString are deliberately
			// discarded: only the returned string is compared with the expectation.
			actual, _, _ := unstructured.NestedString(obj.Object, strings.Split(path, ".")...)
			if actual != want {
				matched = false
				break
			}
		}
		if !matched {
			continue
		}

		result := &HealthStatus{Health: HealthUnknown}
		filter.OnCondition.Apply(result, &metav1.Condition{})
		return result, nil
	}

	return GetHealthFromStatus(GetGenericStatus(obj), statusMap)
}

6 changes: 6 additions & 0 deletions pkg/health/statusMap.yaml
Original file line number Diff line number Diff line change
@@ -106,6 +106,12 @@ HelmRelease: &flux
order: 1

HelmRepository: &flux
filters:
# OCI HelmRepositories do not have status info
- match:
spec.type: oci
ready: true
health: unknown
conditions:
Reconciling:
status: Reconciling
44 changes: 44 additions & 0 deletions pkg/health/testdata/Kubernetes/HelmRepository/healthy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
annotations:
expected-ready: "true"
expected-status: Succeeded
creationTimestamp: "2024-07-12T10:41:33Z"
finalizers:
- finalizers.fluxcd.io
generation: 1
labels:
kustomize.toolkit.fluxcd.io/name: aws-demo-infra
kustomize.toolkit.fluxcd.io/namespace: flux-system
name: argo
namespace: argo
resourceVersion: "371712638"
uid: 1925e5d0-37b1-412f-a33c-9411c51d4236
spec:
interval: 1m
provider: generic
url: https://argoproj.github.io/argo-helm
status:
artifact:
digest: sha256:52fc912eed981927098a128e32ec31db642af7a8cc21341b120d7284d62eea47
lastUpdateTime: "2024-11-22T15:21:35Z"
path: helmrepository/argo/argo/index-3bda2a43c97097788abae208248537a2a08dc73f175821dfb4415509db222c43.yaml
revision: sha256:3bda2a43c97097788abae208248537a2a08dc73f175821dfb4415509db222c43
size: 1639422
url: http://source-controller.flux-system.svc.cluster.local./helmrepository/argo/argo/index-3bda2a43c97097788abae208248537a2a08dc73f175821dfb4415509db222c43.yaml
conditions:
- lastTransitionTime: "2024-11-22T15:21:35Z"
message: 'stored artifact: revision ''sha256:3bda2a43c97097788abae208248537a2a08dc73f175821dfb4415509db222c43'''
observedGeneration: 1
reason: Succeeded
status: "True"
type: Ready
- lastTransitionTime: "2024-11-22T15:21:35Z"
message: 'stored artifact: revision ''sha256:3bda2a43c97097788abae208248537a2a08dc73f175821dfb4415509db222c43'''
observedGeneration: 1
reason: Succeeded
status: "True"
type: ArtifactInStorage
observedGeneration: 1
url: http://source-controller.flux-system.svc.cluster.local./helmrepository/argo/argo/index.yaml
21 changes: 21 additions & 0 deletions pkg/health/testdata/Kubernetes/HelmRepository/unknown.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
annotations:
expected-ready: "true"
creationTimestamp: "2024-11-25T10:53:37Z"
generation: 1
labels:
canaries.flanksource.com/canary-id: ""
canaries.flanksource.com/check-id: ""
canaries.flanksource.com/is-static: "true"
name: bitnami
namespace: control-plane-tests
resourceVersion: "373756700"
uid: 9197ff09-cd5c-42b5-a6d6-321132aeaf43
spec:
interval: 1h
provider: generic
type: oci
url: oci://registry-1.docker.io/bitnamicharts
status: {}
180 changes: 180 additions & 0 deletions pkg/health/testdata/Kubernetes/Pod/early-failures.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
apiVersion: v1
kind: Pod
metadata:
uid: 26361045-4fc3-4c59-8f7c-8f06a79b53d2
name: canary-checker-6985458cf7-45l8t
labels:
control-plane: canary-checker
pod-template-hash: 6985458cf7
app.kubernetes.io/name: canary-checker
app.kubernetes.io/instance: mission-control
annotations:
expected-status: Running
expected-health: warning
expected-message: restarted 2 times
namespace: mission-control
generateName: canary-checker-6985458cf7-
ownerReferences:
- uid: 0ce8f26a-5092-411e-a8b9-7a60a24ed6a5
kind: ReplicaSet
name: canary-checker-6985458cf7
apiVersion: apps/v1
controller: true
blockOwnerDeletion: true
creationTimestamp: "@now-4h"
spec:
volumes:
- name: podinfo
downwardAPI:
items:
- path: labels
fieldRef:
fieldPath: metadata.labels
apiVersion: v1
defaultMode: 420
- name: config
configMap:
name: canary-checker
defaultMode: 420
- name: kube-api-access-dwgfj
projected:
sources:
- serviceAccountToken:
path: token
expirationSeconds: 3607
- configMap:
name: kube-root-ca.crt
items:
- key: ca.crt
path: ca.crt
- downwardAPI:
items:
- path: namespace
fieldRef:
fieldPath: metadata.namespace
apiVersion: v1
defaultMode: 420
nodeName: ip-10-0-6-119.eu-west-1.compute.internal
priority: 0
dnsPolicy: ClusterFirst
containers:
- env:
- name: PING_MODE
value: unprivileged
- name: DEBUG
value: "true"
- name: DB_URL
valueFrom:
secretKeyRef:
key: DB_URL
name: incident-commander-postgres
args:
- operator
- --httpPort
- "8080"
- --disable-postgrest=true
- --db-migrations=false
- --json-logs
- --prometheus=http://prometheus.monitoring.svc:9090
- --otel-collector-url=grafana-alloy.monitoring:4317
- --otel-service-name=canary-checker
name: canary-checker
image: public.ecr.aws/flanksource/canary-checker-full:v1.1.0-beta.82
command:
- /app/canary-checker
resources:
limits:
memory: 2Gi
requests:
cpu: 200m
memory: 200Mi
volumeMounts:
- name: podinfo
mountPath: /etc/podinfo
- name: config
subPath: canary-checker.properties
mountPath: /app/canary-checker.properties
- name: kube-api-access-dwgfj
readOnly: true
mountPath: /var/run/secrets/kubernetes.io/serviceaccount
livenessProbe:
httpGet:
path: /health
port: 8080
scheme: HTTP
periodSeconds: 10
timeoutSeconds: 1
failureThreshold: 3
successThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 8080
scheme: HTTP
periodSeconds: 10
timeoutSeconds: 1
failureThreshold: 3
successThreshold: 1
imagePullPolicy: IfNotPresent
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
tolerations:
- key: node.kubernetes.io/not-ready
effect: NoExecute
operator: Exists
tolerationSeconds: 300
- key: node.kubernetes.io/unreachable
effect: NoExecute
operator: Exists
tolerationSeconds: 300
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: canary-checker-sa
securityContext:
fsGroup: 1000
sysctls:
- name: net.ipv4.ping_group_range
value: 0 2147483647
preemptionPolicy: PreemptLowerPriority
enableServiceLinks: true
serviceAccountName: canary-checker-sa
terminationGracePeriodSeconds: 30
status:
phase: Running
podIP: 10.0.6.105
hostIP: 10.0.6.119
podIPs:
- ip: 10.0.6.105
hostIPs:
- ip: 10.0.6.119
qosClass: Burstable
startTime: 2024-11-25T11:38:52Z
conditions:
- type: ContainersReady
status: "True"
- type: Initialized
status: "True"
- type: PodReadyToStartContainers
status: "True"
- type: PodScheduled
status: "True"
- type: Ready
status: "True"
containerStatuses:
- name: canary-checker
image: public.ecr.aws/flanksource/canary-checker-full:v1.1.0-beta.82
ready: true
state:
running:
startedAt: "@now-4h"
imageID: public.ecr.aws/flanksource/canary-checker-full@sha256:4ab67ba4b7645095b0f82decb31dc55c9bcbec124132df8e64b076893cc917a3
started: true
lastState:
terminated:
reason: Error
exitCode: 2
startedAt: 2024-11-25T11:39:03Z
finishedAt: "@now-4h"
containerID: containerd://a7dc7e988a145518b89aac1a7a8147a6504bb7c8dd2ba1ccb325839fa5a7ffcb
containerID: containerd://ef58d4532a35546dd5d47de2846b5ef08efb0bcc834c8d448ec45fbb321d9073
restartCount: 2
Original file line number Diff line number Diff line change
@@ -7,7 +7,7 @@ metadata:
annotations:
expected-status: OOMKilled
expected-health: unhealthy
expected-message: system has run out of memory
expected-message: system has run out of memory, restarted 9 times
creationTimestamp: 2024-11-20T06:57:31Z
spec:
volumes:
Loading