From cba9d80c274966bfabba70536c82fd8dcf46af3e Mon Sep 17 00:00:00 2001
From: Aditya Thebe
Date: Thu, 20 Jun 2024 18:01:27 +0545
Subject: [PATCH] fix: check init container status when pod is pending

---
 pkg/health/health_pod.go                  |  36 +++-
 pkg/health/health_test.go                 |   1 +
 .../testdata/pod-init-container-fail.yaml | 171 ++++++++++++++++++
 3 files changed, 200 insertions(+), 8 deletions(-)
 create mode 100644 pkg/health/testdata/pod-init-container-fail.yaml

diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go
index 351d7c2..31c1e7d 100644
--- a/pkg/health/health_pod.go
+++ b/pkg/health/health_pod.go
@@ -42,6 +42,20 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
 		}
 	}
 
+	getCommonContainerError := func(containerStatus *corev1.ContainerStatus) *HealthStatus {
+		waiting := containerStatus.State.Waiting
+		// Article listing common container errors: https://medium.com/kokster/debugging-crashloopbackoffs-with-init-containers-26f79e9fb5bf
+		if waiting != nil && (strings.HasPrefix(waiting.Reason, "Err") || strings.HasSuffix(waiting.Reason, "Error") || strings.HasSuffix(waiting.Reason, "BackOff")) {
+			return &HealthStatus{
+				Status:  HealthStatusCode(waiting.Reason),
+				Health:  HealthUnhealthy,
+				Message: waiting.Message,
+			}
+		}
+
+		return nil
+	}
+
 	// This logic cannot be applied when the pod.Spec.RestartPolicy is: corev1.RestartPolicyOnFailure,
 	// corev1.RestartPolicyNever, otherwise it breaks the resource hook logic.
 	// The issue is, if we mark a pod with ImagePullBackOff as Degraded, and the pod is used as a resource hook,
@@ -54,12 +68,10 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
 		var messages []string
 
 		for _, containerStatus := range pod.Status.ContainerStatuses {
-			waiting := containerStatus.State.Waiting
-			// Article listing common container errors: https://medium.com/kokster/debugging-crashloopbackoffs-with-init-containers-26f79e9fb5bf
-			if waiting != nil && (strings.HasPrefix(waiting.Reason, "Err") || strings.HasSuffix(waiting.Reason, "Error") || strings.HasSuffix(waiting.Reason, "BackOff")) {
-				health = HealthUnhealthy
-				status = HealthStatusCode(waiting.Reason)
-				messages = append(messages, waiting.Message)
+			if msg := getCommonContainerError(&containerStatus); msg != nil {
+				health = msg.Health
+				status = msg.Status
+				messages = append(messages, msg.Message)
 			}
 		}
 
@@ -89,11 +101,18 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
 	switch pod.Status.Phase {
 	case corev1.PodPending:
+		for _, ctrStatus := range pod.Status.InitContainerStatuses {
+			if msg := getCommonContainerError(&ctrStatus); msg != nil {
+				return msg, nil
+			}
+		}
+
 		return &HealthStatus{
 			Health:  HealthUnknown,
 			Status:  HealthStatusPending,
 			Message: pod.Status.Message,
 		}, nil
+
 	case corev1.PodSucceeded:
 		return &HealthStatus{
 			Health:  HealthHealthy,
 			Status:  HealthStatusCompleted,
 			Ready:   true,
 			Message: pod.Status.Message,
 		}, nil
+
 	case corev1.PodFailed:
 		if pod.Status.Message != "" {
 			// Pod has a nice error message. Use that.
@@ -113,6 +133,7 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
 		}
 
 		return &HealthStatus{Health: HealthUnhealthy, Status: HealthStatusError, Message: "", Ready: true}, nil
+
 	case corev1.PodRunning:
 		switch pod.Spec.RestartPolicy {
 		case corev1.RestartPolicyAlways:
@@ -142,8 +163,8 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
 				Status:  HealthStatusStarting,
 				Message: pod.Status.Message,
 			}, nil
-		case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
 
+		case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
 			if isReady {
 				return &HealthStatus{
 					Health: HealthHealthy,
@@ -155,7 +176,6 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
 				Status: HealthStatusRunning,
 			}, nil
 		}
-
 	}
 }
diff --git a/pkg/health/health_test.go b/pkg/health/health_test.go
index a55a74f..bfd4167 100644
--- a/pkg/health/health_test.go
+++ b/pkg/health/health_test.go
@@ -131,6 +131,7 @@ func TestPod(t *testing.T) {
 	assertAppHealth(t, "./testdata/pod-failed.yaml", health.HealthStatusError, health.HealthUnhealthy, true)
 	assertAppHealth(t, "./testdata/pod-succeeded.yaml", health.HealthStatusCompleted, health.HealthHealthy, true)
 	assertAppHealth(t, "./testdata/pod-deletion.yaml", health.HealthStatusTerminating, health.HealthUnhealthy, false)
+	assertAppHealth(t, "./testdata/pod-init-container-fail.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false)
 }
 
 // func TestAPIService(t *testing.T) {
diff --git a/pkg/health/testdata/pod-init-container-fail.yaml b/pkg/health/testdata/pod-init-container-fail.yaml
new file mode 100644
index 0000000..75d0f0c
--- /dev/null
+++ b/pkg/health/testdata/pod-init-container-fail.yaml
@@ -0,0 +1,171 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  uid: d21e1521-5a3f-4120-a446-bd7426199a20
+  name: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-0
+  labels:
+    app: postgresql
+    controller-revision-hash: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-988f9fc65
+    statefulset.kubernetes.io/pod-name: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-0
+  namespace: httpbin
+  generateName: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-
+  ownerReferences:
+    - uid: da460101-eebb-4d4f-b8f4-acb8908d7083
+      kind: StatefulSet
+      name: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc
+      apiVersion: apps/v1
+      controller: true
+      blockOwnerDeletion: true
+  creationTimestamp: 2024-06-18T14:48:55Z
+spec:
+  volumes:
+    - name: postgresql
+      persistentVolumeClaim:
+        claimName: postgresql-postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-0
+    - name: dshm
+      emptyDir:
+        medium: Memory
+        sizeLimit: 256Mi
+    - name: logs
+      emptyDir: {}
+    - name: kube-api-access-6qtdz
+      projected:
+        sources:
+          - serviceAccountToken:
+              path: token
+              expirationSeconds: 3607
+          - configMap:
+              name: kube-root-ca.crt
+              items:
+                - key: ca.crt
+                  path: ca.crt
+          - downwardAPI:
+              items:
+                - path: namespace
+                  fieldRef:
+                    fieldPath: metadata.namespace
+                    apiVersion: v1
+        defaultMode: 420
+  hostname: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-0
+  nodeName: ip-10-0-6-40.eu-west-1.compute.internal
+  priority: 0
+  dnsPolicy: ClusterFirst
+  subdomain: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc
+  containers:
+    - env:
+        - name: PGDATA
+          value: /var/lib/postgresql/data
+        - name: POSTGRES_DB
+          value: mission_control
+      name: postgresql
+      image: postgres:14
+      resources:
+        requests:
+          memory: 4Gi
+        limits:
+          memory: 5Gi
+          cpu: 2
+      volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+        - name: postgresql
+          subPath: postgres
+          mountPath: /var/lib/postgresql/data
+        - name: logs
+          mountPath: /var/log/postgresql
+        - name: kube-api-access-6qtdz
+          readOnly: true
+          mountPath: /var/run/secrets/kubernetes.io/serviceaccount
+      imagePullPolicy: IfNotPresent
+      terminationMessagePath: /dev/termination-log
+      terminationMessagePolicy: File
+  tolerations:
+    - key: node.kubernetes.io/not-ready
+      effect: NoExecute
+      operator: Exists
+      tolerationSeconds: 300
+    - key: node.kubernetes.io/unreachable
+      effect: NoExecute
+      operator: Exists
+      tolerationSeconds: 300
+  restartPolicy: Always
+  schedulerName: default-scheduler
+  initContainers:
+    - name: postgres-perms
+      image: busybox
+      command:
+        - sh
+        - -c
+        - mkdir -p /postgres && chmod -R 0750 /postgres && chown 999:999 -R /postgres && chmod -R 777 /dev/shm
+      resources: {}
+      volumeMounts:
+        - name: postgresql
+          mountPath: /postgres
+        - name: dshm
+          mountPath: /dev/shm
+        - name: kube-api-access-6qtdz
+          readOnly: true
+          mountPath: /var/run/secrets/kubernetes.io/serviceaccount
+      imagePullPolicy: Always
+      terminationMessagePath: /dev/termination-log
+      terminationMessagePolicy: File
+  serviceAccount: default
+  securityContext: {}
+  preemptionPolicy: PreemptLowerPriority
+  enableServiceLinks: true
+  serviceAccountName: default
+  terminationGracePeriodSeconds: 30
+status:
+  phase: Pending
+  podIP: 10.0.6.127
+  hostIP: 10.0.6.40
+  podIPs:
+    - ip: 10.0.6.127
+  qosClass: Burstable
+  startTime: 2024-06-18T14:48:59Z
+  conditions:
+    - type: Initialized
+      reason: ContainersNotInitialized
+      status: "False"
+      message: "containers with incomplete status: [postgres-perms]"
+    - type: Ready
+      reason: ContainersNotReady
+      status: "False"
+      message: "containers with unready status: [postgresql]"
+    - type: ContainersReady
+      reason: ContainersNotReady
+      status: "False"
+      message: "containers with unready status: [postgresql]"
+    - type: PodScheduled
+      status: "True"
+  containerStatuses:
+    - name: postgresql
+      image: postgres:14
+      ready: false
+      state:
+        waiting:
+          reason: PodInitializing
+      imageID: ""
+      started: false
+      lastState: {}
+      restartCount: 0
+  initContainerStatuses:
+    - name: postgres-perms
+      image: docker.io/library/busybox:latest
+      ready: false
+      state:
+        waiting:
+          reason: CrashLoopBackOff
+          message: back-off 5m0s restarting failed container=postgres-perms pod=postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-0_httpbin(d21e1521-5a3f-4120-a446-bd7426199a20)
+      imageID: docker.io/library/busybox@sha256:9ae97d36d26566ff84e8893c64a6dc4fe8ca6d1144bf5b87b2b85a32def253c7
+      lastState:
+        terminated:
+          reason: Error
+          exitCode: 1
+          startedAt: 2024-06-20T06:27:47Z
+          finishedAt: 2024-06-20T06:27:48Z
+          containerID: containerd://082e4785dec00cf25f3001af654abdeb4197a9166aee8de5f3b6a2a9e6a9db94
+      containerID: containerd://082e4785dec00cf25f3001af654abdeb4197a9166aee8de5f3b6a2a9e6a9db94
+      restartCount: 470
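
Illustrative sketch, not part of the patch above: a minimal in-package test showing the behaviour the new PodPending branch is intended to have, without going through the YAML fixture. It assumes only what the diff shows (the unexported getCorev1PodHealth signature, HealthUnhealthy, and the HealthStatusCrashLoopBackoff constant referenced by health_test.go); the test name and the inline pod fixture are hypothetical.

package health

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
)

// Hypothetical white-box test: a Pending pod whose init container is stuck in
// CrashLoopBackOff should surface that reason instead of a generic Pending.
func TestPendingPodWithFailingInitContainer(t *testing.T) {
	pod := &corev1.Pod{
		Spec: corev1.PodSpec{RestartPolicy: corev1.RestartPolicyAlways},
		Status: corev1.PodStatus{
			Phase: corev1.PodPending,
			InitContainerStatuses: []corev1.ContainerStatus{
				{
					Name: "init",
					State: corev1.ContainerState{
						Waiting: &corev1.ContainerStateWaiting{
							Reason:  "CrashLoopBackOff",
							Message: "back-off 5m0s restarting failed container=init",
						},
					},
				},
			},
		},
	}

	hs, err := getCorev1PodHealth(pod)
	if err != nil {
		t.Fatal(err)
	}
	if hs == nil || hs.Health != HealthUnhealthy || hs.Status != HealthStatusCrashLoopBackoff {
		t.Errorf("unexpected health status: %+v", hs)
	}
}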