Commit
fix: check init container status when pod is pending
adityathebe authored and moshloop committed Jun 20, 2024
1 parent d225976 commit cba9d80
Showing 3 changed files with 200 additions and 8 deletions.
36 changes: 28 additions & 8 deletions pkg/health/health_pod.go
@@ -42,6 +42,20 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
}
}

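// getCommonContainerError returns an unhealthy HealthStatus when a container
// is waiting with a well-known error reason (Err*, *Error, or *BackOff);
// it returns nil otherwise.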
getCommonContainerError := func(containerStatus *corev1.ContainerStatus) *HealthStatus {
waiting := containerStatus.State.Waiting
// Article listing common container errors: https://medium.com/kokster/debugging-crashloopbackoffs-with-init-containers-26f79e9fb5bf
if waiting != nil && (strings.HasPrefix(waiting.Reason, "Err") || strings.HasSuffix(waiting.Reason, "Error") || strings.HasSuffix(waiting.Reason, "BackOff")) {
return &HealthStatus{
Status: HealthStatusCode(waiting.Reason),
Health: HealthUnhealthy,
Message: waiting.Message,
}
}

return nil
}

// This logic cannot be applied when the pod.Spec.RestartPolicy is: corev1.RestartPolicyOnFailure,
// corev1.RestartPolicyNever, otherwise it breaks the resource hook logic.
// The issue is, if we mark a pod with ImagePullBackOff as Degraded, and the pod is used as a resource hook,
@@ -54,12 +68,10 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
var messages []string

for _, containerStatus := range pod.Status.ContainerStatuses {
waiting := containerStatus.State.Waiting
// Article listing common container errors: https://medium.com/kokster/debugging-crashloopbackoffs-with-init-containers-26f79e9fb5bf
if waiting != nil && (strings.HasPrefix(waiting.Reason, "Err") || strings.HasSuffix(waiting.Reason, "Error") || strings.HasSuffix(waiting.Reason, "BackOff")) {
health = HealthUnhealthy
status = HealthStatusCode(waiting.Reason)
messages = append(messages, waiting.Message)
if msg := getCommonContainerError(&containerStatus); msg != nil {
health = msg.Health
status = msg.Status
messages = append(messages, msg.Message)
}
}

@@ -89,18 +101,26 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {

switch pod.Status.Phase {
case corev1.PodPending:
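// A pending pod may be blocked by a failing init container (e.g. one stuck
// in CrashLoopBackOff); surface that container error instead of reporting a
// generic pending status.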
for _, ctrStatus := range pod.Status.InitContainerStatuses {
if msg := getCommonContainerError(&ctrStatus); msg != nil {
return msg, nil
}
}

return &HealthStatus{
Health: HealthUnknown,
Status: HealthStatusPending,
Message: pod.Status.Message,
}, nil

case corev1.PodSucceeded:
return &HealthStatus{
Health: HealthHealthy,
Status: HealthStatusCompleted,
Ready: true,
Message: pod.Status.Message,
}, nil

case corev1.PodFailed:
if pod.Status.Message != "" {
// Pod has a nice error message. Use that.
@@ -113,6 +133,7 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
}

return &HealthStatus{Health: HealthUnhealthy, Status: HealthStatusError, Message: "", Ready: true}, nil

case corev1.PodRunning:
switch pod.Spec.RestartPolicy {
case corev1.RestartPolicyAlways:
@@ -142,8 +163,8 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
Status: HealthStatusStarting,
Message: pod.Status.Message,
}, nil
case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:

case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
if isReady {
return &HealthStatus{
Health: HealthHealthy,
@@ -155,7 +176,6 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
Status: HealthStatusRunning,
}, nil
}

}
}

1 change: 1 addition & 0 deletions pkg/health/health_test.go
@@ -131,6 +131,7 @@ func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/pod-failed.yaml", health.HealthStatusError, health.HealthUnhealthy, true)
assertAppHealth(t, "./testdata/pod-succeeded.yaml", health.HealthStatusCompleted, health.HealthHealthy, true)
assertAppHealth(t, "./testdata/pod-deletion.yaml", health.HealthStatusTerminating, health.HealthUnhealthy, false)
assertAppHealth(t, "./testdata/pod-init-container-fail.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false)
}

// func TestAPIService(t *testing.T) {
171 changes: 171 additions & 0 deletions pkg/health/testdata/pod-init-container-fail.yaml
@@ -0,0 +1,171 @@
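# Fixture: a Pending pod whose init container (postgres-perms) is stuck in
# CrashLoopBackOff while the main container is still waiting in PodInitializing.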
apiVersion: v1
kind: Pod
metadata:
uid: d21e1521-5a3f-4120-a446-bd7426199a20
name: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-0
labels:
app: postgresql
controller-revision-hash: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-988f9fc65
statefulset.kubernetes.io/pod-name: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-0
namespace: httpbin
generateName: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-
ownerReferences:
- uid: da460101-eebb-4d4f-b8f4-acb8908d7083
kind: StatefulSet
name: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc
apiVersion: apps/v1
controller: true
blockOwnerDeletion: true
creationTimestamp: 2024-06-18T14:48:55Z
spec:
volumes:
- name: postgresql
persistentVolumeClaim:
claimName: postgresql-postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-0
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 256Mi
- name: logs
emptyDir: {}
- name: kube-api-access-6qtdz
projected:
sources:
- serviceAccountToken:
path: token
expirationSeconds: 3607
- configMap:
name: kube-root-ca.crt
items:
- key: ca.crt
path: ca.crt
- downwardAPI:
items:
- path: namespace
fieldRef:
fieldPath: metadata.namespace
apiVersion: v1
defaultMode: 420
hostname: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-0
nodeName: ip-10-0-6-40.eu-west-1.compute.internal
priority: 0
dnsPolicy: ClusterFirst
subdomain: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc
containers:
- env:
- name: PGDATA
value: /var/lib/postgresql/data
- name: POSTGRES_DB
value: mission_control
name: postgresql
image: postgres:14
resources:
requests:
memory: 4Gi
limits:
memory: 5Gi
cpu: 2
volumeMounts:
- name: dshm
mountPath: /dev/shm
- name: postgresql
subPath: postgres
mountPath: /var/lib/postgresql/data
- name: logs
mountPath: /var/log/postgresql
- name: kube-api-access-6qtdz
readOnly: true
mountPath: /var/run/secrets/kubernetes.io/serviceaccount
imagePullPolicy: IfNotPresent
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
tolerations:
- key: node.kubernetes.io/not-ready
effect: NoExecute
operator: Exists
tolerationSeconds: 300
- key: node.kubernetes.io/unreachable
effect: NoExecute
operator: Exists
tolerationSeconds: 300
restartPolicy: Always
schedulerName: default-scheduler
initContainers:
- name: postgres-perms
image: busybox
command:
- sh
- -c
- mkdir -p /postgres && chmod -R 0750 /postgres &&
chown 999:999 -R /postgres && chmod -R 777 /dev/shm
resources: {}
volumeMounts:
- name: postgresql
mountPath: /postgres
- name: dshm
mountPath: /dev/shm
- name: kube-api-access-6qtdz
readOnly: true
mountPath: /var/run/secrets/kubernetes.io/serviceaccount
imagePullPolicy: Always
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
serviceAccount: default
securityContext: {}
preemptionPolicy: PreemptLowerPriority
enableServiceLinks: true
serviceAccountName: default
terminationGracePeriodSeconds: 30
status:
phase: Pending
podIP: 10.0.6.127
hostIP: 10.0.6.40
podIPs:
- ip: 10.0.6.127
qosClass: Burstable
startTime: 2024-06-18T14:48:59Z
conditions:
- type: Initialized
reason: ContainersNotInitialized
status: "False"
message: "containers with incomplete status: [postgres-perms]"
- type: Ready
reason: ContainersNotReady
status: "False"
message: "containers with unready status: [postgresql]"
- type: ContainersReady
reason: ContainersNotReady
status: "False"
message: "containers with unready status: [postgresql]"
- type: PodScheduled
status: "True"
containerStatuses:
- name: postgresql
image: postgres:14
ready: false
state:
waiting:
reason: PodInitializing
imageID: ""
started: false
lastState: {}
restartCount: 0
initContainerStatuses:
- name: postgres-perms
image: docker.io/library/busybox:latest
ready: false
state:
waiting:
reason: CrashLoopBackOff
message: back-off 5m0s restarting failed container=postgres-perms
pod=postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-0_httpbin(d21e1521-5a3f-4120-a446-bd7426199a20)
imageID: docker.io/library/busybox@sha256:9ae97d36d26566ff84e8893c64a6dc4fe8ca6d1144bf5b87b2b85a32def253c7
lastState:
terminated:
reason: Error
exitCode: 1
startedAt: 2024-06-20T06:27:47Z
finishedAt: 2024-06-20T06:27:48Z
containerID: containerd://082e4785dec00cf25f3001af654abdeb4197a9166aee8de5f3b6a2a9e6a9db94
containerID: containerd://082e4785dec00cf25f3001af654abdeb4197a9166aee8de5f3b6a2a9e6a9db94
restartCount: 470
