Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: improve aws health reporting #132

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 37 additions & 11 deletions pkg/health/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@ import (
"strings"
"time"

"github.com/samber/lo"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/duration"
)

var DefaultOverrides HealthOverride

type Health string

const (
Expand Down Expand Up @@ -40,13 +44,15 @@ const (
HealthStatusEvicted HealthStatusCode = "Evicted"
HealthStatusCompleted HealthStatusCode = "Completed"
HealthStatusCrashLoopBackoff HealthStatusCode = "CrashLoopBackOff"
HealthStatusCrashLoop HealthStatusCode = "CrashLoop"
HealthStatusCrashed HealthStatusCode = "Crashed"
HealthStatusCreating HealthStatusCode = "Creating"
HealthStatusDeleted HealthStatusCode = "Deleted"
HealthStatusDeleting HealthStatusCode = "Deleting"
HealthStatusTerminating HealthStatusCode = "Terminating"
HealthStatusError HealthStatusCode = "Error"
HealthStatusRolloutFailed HealthStatusCode = "Rollout Failed"
HealthStatusInaccesible HealthStatusCode = "Inaccesible"
HealthStatusInaccesible HealthStatusCode = "Inaccessible"
HealthStatusInfo HealthStatusCode = "Info"
HealthStatusPending HealthStatusCode = "Pending"
HealthStatusMaintenance HealthStatusCode = "Maintenance"
Expand Down Expand Up @@ -100,12 +106,36 @@ func IsWorse(current, new HealthStatusCode) bool {
return newIndex > currentIndex
}

func GetHealthByConfigType(configType string, obj map[string]any) HealthStatus {
if strings.HasPrefix(configType, "Mongo::") {
func GetHealthByConfigType(configType string, obj map[string]any, states ...string) HealthStatus {
switch configType {
case "AWS::ECS::Task":
return GetECSTaskHealth(obj)
}

configClass := strings.Split(configType, "::")[0]

switch strings.ToLower(configClass) {
case "mongo":
return GetMongoHealth(obj)
case "kubernetes", "crossplane", "missioncontrol", "flux", "argo":
hr, err := GetResourceHealth(&unstructured.Unstructured{Object: obj}, DefaultOverrides)
if hr != nil {
return *hr
}
if err != nil {
return HealthStatus{
Status: "HealthParseError",
Message: lo.Elipse(err.Error(), 500),
}
}
}

return HealthStatus{}
if len(states) > 0 {
return GetHealthFromStatusName(states[0])
}
return HealthStatus{
Health: HealthUnknown,
}
}

// GetResourceHealth returns the health of a k8s resource
Expand All @@ -118,8 +148,8 @@ func GetResourceHealth(
terminatingFor := time.Since(obj.GetDeletionTimestamp().Time)
return &HealthStatus{
Status: "TerminatingStalled",
Health: HealthUnhealthy,
Message: fmt.Sprintf("Resource is terminating, time since deletion: %v", terminatingFor),
Health: HealthWarning,
Message: fmt.Sprintf("terminating for %v", duration.ShortHumanDuration(terminatingFor.Truncate(time.Hour))),
}, nil
}

Expand Down Expand Up @@ -169,10 +199,6 @@ func GetHealthCheckFunc(gvk schema.GroupVersionKind) func(obj *unstructured.Unst
return getNodeHealth
}

if strings.HasSuffix(gvk.Group, ".crossplane.io") || strings.HasSuffix(gvk.Group, ".upbound.io") {
return GetDefaultHealth
}

switch gvk.Group {
case "apps":
switch gvk.Kind {
Expand Down Expand Up @@ -235,5 +261,5 @@ func GetHealthCheckFunc(gvk schema.GroupVersionKind) func(obj *unstructured.Unst
return getHPAHealth
}
}
return nil
return GetDefaultHealth
}
142 changes: 2 additions & 140 deletions pkg/health/health_aws.go
Original file line number Diff line number Diff line change
@@ -1,143 +1,5 @@
package health

import (
"strings"

"github.com/samber/lo"
)

// Resource type identifiers passed as the resourceType argument of
// GetAWSResourceHealth. Each value is a top-level key of
// awsResourceHealthmap, which holds the per-status health for that type.
const (
AWSResourceTypeEBS string = "ebs"
AWSResourceTypeEC2 string = "ec2"
AWSResourceTypeEKS string = "eks"
AWSResourceTypeELB string = "elb"
AWSResourceTypeRDS string = "rds"
AWSResourceTypeVPC string = "vpc"
AWSResourceTypeSubnet string = "subnet"
AWSResourceTypeCloudformationStack string = "cloudformationstack"
)

// GetAWSResourceHealth resolves the health of an AWS resource from its
// resource type and raw status string.
//
// The status is matched case-insensitively against the entries registered
// for resourceType in awsResourceHealthmap. On a match the stored Health and
// Ready values are returned, with Status set to a display form of the raw
// status: '-' and '_' become spaces and the first letter is capitalised.
// When either the resource type or the status is not registered, an Unknown
// status with Unknown health and Ready=false is returned.
func GetAWSResourceHealth(resourceType, status string) (health HealthStatus) {
	unknown := HealthStatus{
		Status: HealthStatusUnknown,
		Health: HealthUnknown,
		Ready:  false,
	}

	statuses, ok := awsResourceHealthmap[resourceType]
	if !ok {
		return unknown
	}

	entry, ok := statuses[strings.ToLower(status)]
	if !ok {
		return unknown
	}

	// Build the human-readable status from the caller's original spelling.
	display := strings.ReplaceAll(status, "-", " ")
	display = strings.ReplaceAll(display, "_", " ")
	entry.Status = HealthStatusCode(lo.Capitalize(display))
	return entry
}

var awsResourceHealthmap = map[string]map[string]HealthStatus{
AWSResourceTypeCloudformationStack: {
"create_complete": HealthStatus{Health: HealthHealthy, Ready: true},
"create_failed": HealthStatus{Health: HealthUnhealthy, Ready: true},
"create_in_progress": HealthStatus{Health: HealthUnknown},
"delete_complete": HealthStatus{Health: HealthUnknown, Ready: true},
"delete_failed": HealthStatus{Health: HealthUnhealthy, Ready: true},
"delete_in_progress": HealthStatus{Health: HealthUnknown},
"import_complete": HealthStatus{Health: HealthHealthy, Ready: true},
"import_in_progress": HealthStatus{Health: HealthUnknown},
"import_rollback_complete": HealthStatus{Health: HealthWarning, Ready: true},
"import_rollback_failed": HealthStatus{Health: HealthUnhealthy, Ready: true},
"import_rollback_in_progress": HealthStatus{Health: HealthWarning},
"review_in_progress": HealthStatus{Health: HealthUnknown},
"rollback_complete": HealthStatus{Health: HealthWarning, Ready: true},
"rollback_failed": HealthStatus{Health: HealthUnhealthy, Ready: true},
"rollback_in_progress": HealthStatus{Health: HealthWarning},
"update_complete_cleanup_in_progress": HealthStatus{Health: HealthUnknown},
"update_complete": HealthStatus{Health: HealthHealthy, Ready: true},
"update_failed": HealthStatus{Health: HealthUnhealthy, Ready: true},
"update_in_progress": HealthStatus{Health: HealthUnknown},
"update_rollback_complete_cleanup_in_progress": HealthStatus{Health: HealthUnknown},
"update_rollback_complete": HealthStatus{Health: HealthWarning, Ready: true},
"update_rollback_failed": HealthStatus{Health: HealthUnhealthy, Ready: true},
"update_rollback_in_progress": HealthStatus{Health: HealthWarning},
},

AWSResourceTypeEC2: {
"pending": HealthStatus{Health: HealthUnknown},
"running": HealthStatus{Health: HealthHealthy, Ready: true},
"shutting-down": HealthStatus{Health: HealthUnknown},
"stopped": HealthStatus{Health: HealthUnknown, Ready: true},
"stopping": HealthStatus{Health: HealthUnknown},
"terminated": HealthStatus{Health: HealthUnknown, Ready: true},
},

AWSResourceTypeEKS: {
"creating": HealthStatus{Health: HealthUnknown},
"active": HealthStatus{Health: HealthHealthy, Ready: true},
"deleting": HealthStatus{Health: HealthUnknown},
"failed": HealthStatus{Health: HealthUnhealthy, Ready: true},
"updating": HealthStatus{Health: HealthUnknown},
"pending": HealthStatus{Health: HealthUnknown},
},

AWSResourceTypeEBS: {
"creating": HealthStatus{Health: HealthUnknown},
"available": HealthStatus{Health: HealthHealthy, Ready: true},
"in-use": HealthStatus{Health: HealthHealthy, Ready: true},
"deleting": HealthStatus{Health: HealthUnknown},
"deleted": HealthStatus{Health: HealthUnknown, Ready: true},
"error": HealthStatus{Health: HealthUnhealthy, Ready: true},
},

// https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/accessing-monitoring.html
AWSResourceTypeRDS: {
"available": HealthStatus{Health: HealthHealthy, Ready: true},
"billed": HealthStatus{Health: HealthHealthy, Ready: true},
"backing-up": HealthStatus{Health: HealthHealthy},
"configuring-enhanced-monitoring": HealthStatus{Health: HealthHealthy},
"configuring-iam-database-auth": HealthStatus{Health: HealthHealthy},
"configuring-log-exports": HealthStatus{Health: HealthHealthy},
"converting-to-vpc": HealthStatus{Health: HealthHealthy, Ready: true},
"creating": HealthStatus{Health: HealthUnknown},
"delete-precheck": HealthStatus{Health: HealthHealthy},
"deleting": HealthStatus{Health: HealthUnknown},
"failed": HealthStatus{Health: HealthUnhealthy, Ready: true},
"inaccessible-encryption-credentials": HealthStatus{Health: HealthUnhealthy, Ready: true},
"inaccessible-encryption-credentials-recoverable": HealthStatus{Health: HealthWarning, Ready: true},
"incompatible-network": HealthStatus{Health: HealthUnhealthy, Ready: true},
"incompatible-option-group": HealthStatus{Health: HealthUnhealthy, Ready: true},
"incompatible-parameters": HealthStatus{Health: HealthUnhealthy, Ready: true},
"incompatible-restore": HealthStatus{Health: HealthUnhealthy, Ready: true},
"insufficient-capacity": HealthStatus{Health: HealthUnhealthy, Ready: true},
"maintenance": HealthStatus{Health: HealthHealthy},
"modifying": HealthStatus{Health: HealthHealthy, Ready: true},
"moving-to-vpc": HealthStatus{Health: HealthHealthy},
"rebooting": HealthStatus{Health: HealthHealthy},
"resetting-master-credentials": HealthStatus{Health: HealthHealthy},
"renaming": HealthStatus{Health: HealthHealthy},
"restore-error": HealthStatus{Health: HealthUnhealthy, Ready: true},
"starting": HealthStatus{Health: HealthUnknown},
"stopped": HealthStatus{Health: HealthHealthy, Ready: true},
"stopping": HealthStatus{Health: HealthUnknown},
"storage-config-upgrade": HealthStatus{Health: HealthHealthy},
"storage-full": HealthStatus{Health: HealthUnhealthy},
"storage-optimization": HealthStatus{Health: HealthHealthy},
"upgrading": HealthStatus{Health: HealthHealthy},
},

AWSResourceTypeELB: {
"active": HealthStatus{Health: HealthHealthy, Ready: true},
"provisioning": HealthStatus{Health: HealthUnknown},
"active_impaired": HealthStatus{Health: HealthWarning, Ready: true},
"failed": HealthStatus{Health: HealthUnhealthy, Ready: true},
},

AWSResourceTypeVPC: {
"pending": HealthStatus{Health: HealthUnknown},
"available": HealthStatus{Health: HealthHealthy, Ready: true},
},

AWSResourceTypeSubnet: {
"pending": HealthStatus{Health: HealthUnknown},
"available": HealthStatus{Health: HealthHealthy, Ready: true},
},
// GetAWSResourceHealth derives a HealthStatus from an AWS status string.
// The first argument is ignored; the result comes entirely from
// GetHealthFromStatusName(status).
func GetAWSResourceHealth(_, status string) (health HealthStatus) {
	health = GetHealthFromStatusName(status)
	return health
}
72 changes: 72 additions & 0 deletions pkg/health/health_aws_ecs.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package health

import (
"strings"

"github.com/samber/lo"
)

// GetECSTaskHealth derives a HealthStatus from the attributes of an ECS task.
//
// The following string-valued keys of obj are consulted, each optional:
//
//   - "LastStatus":    initial Status (camel-cased).
//   - "HealthStatus":  initial Health (camel-cased); Unknown when absent.
//   - "StopCode":      when non-empty, replaces Status; well-known codes are
//     further mapped to a Status and/or Health.
//   - "StoppedReason": when it has the form "<Code>: <detail>", Code becomes
//     the Status, the trimmed detail becomes the Message, and Health/Ready
//     are set from Code.
//
// Later keys override what earlier keys set. A missing or non-string value is
// treated as absent rather than causing a panic.
func GetECSTaskHealth(obj map[string]any) (health HealthStatus) {
	// Checked assertion: a task document without a string LastStatus must not
	// panic the caller; it simply yields an empty initial status.
	lastStatus, _ := obj["LastStatus"].(string)

	hr := HealthStatus{
		Status: HealthStatusCode(lo.CamelCase(lastStatus)),
		Health: HealthUnknown,
		Ready:  false,
	}

	if v, ok := obj["HealthStatus"].(string); ok {
		hr.Health = Health(lo.CamelCase(v))
	}

	// NOTE(review): hr.Status has been passed through lo.CamelCase, which
	// presumably lower-cases a single all-caps word ("RUNNING" -> "running"),
	// so these upper-case cases look unreachable. Left as-is because making
	// them match would change Ready/Health for stopped tasks — confirm the
	// intended behaviour before changing.
	switch hr.Status {
	case "RUNNING":
		hr.Health = HealthHealthy
		hr.Ready = true
	case "STOPPED", "DELETED":
		hr.Ready = true
		hr.Health = HealthUnknown
	}

	stopCode, _ := obj["StopCode"].(string)
	if stopCode != "" {
		hr.Status = HealthStatusCode(stopCode)
	}
	switch stopCode {
	case "TaskFailedToStart":
		hr.Health = HealthUnhealthy
	case "EssentialContainerExited":
		hr.Status = HealthStatusCrashed
		hr.Health = HealthUnhealthy
	case "UserInitiated":
		hr.Status = HealthStatusStopped
	case "ServiceSchedulerInitiated":
		hr.Status = HealthStatusTerminating
	}

	reason, ok := obj["StoppedReason"].(string)
	if !ok {
		return hr
	}

	// Only reasons shaped like "<Code>: <detail>" with a non-empty code are
	// interpreted; anything else leaves the status computed above untouched.
	code, detail, found := strings.Cut(reason, ":")
	if !found || code == "" {
		return hr
	}

	hr.Status = HealthStatusCode(code)
	hr.Message = strings.TrimSpace(detail)

	switch hr.Status {
	case "InternalError", "CannotCreateVolumeError", "ResourceNotFoundException", "CannotStartContainerError":
		hr.Health = HealthUnhealthy
		hr.Ready = true
	case "SpotInterruptionError", "CannotStopContainerError", "CannotInspectContainerError":
		hr.Health = HealthWarning
	case "ContainerRuntimeError", "ContainerRuntimeTimeoutError", "OutOfMemoryError",
		"TaskFailedToStart", "ResourceInitializationError", "CannotPullContainer":
		hr.Health = HealthUnhealthy
	default:
		// Any unrecognised stop reason code is treated as a failure.
		hr.Health = HealthUnhealthy
	}

	return hr
}
18 changes: 18 additions & 0 deletions pkg/health/health_aws_ecs_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package health_test

import (
"testing"

"github.com/flanksource/is-healthy/pkg/health"
)

// TestECSTask runs the AWS::ECS::Task/failed.yaml fixture through
// assertAppHealthMsg and expects a CannotPullContainerError status that is
// unhealthy, not ready, and carries the image pull failure as its message.
func TestECSTask(t *testing.T) {
	const (
		fixture    = "AWS::ECS::Task/failed.yaml"
		wantStatus = "CannotPullContainerError"
		wantReady  = false
		wantMsg    = "pull image manifest has been retried 5 time(s): failed to resolve ref docker.com/iiab-processing-fargate:dev: failed to do request: Head \"https://docker.com/v2/iiab-processing-fargate/manifests/dev\": dial tcp 10.0.0.1:443: connect: connection refused"
	)

	assertAppHealthMsg(
		t,
		fixture,
		wantStatus,
		health.HealthUnhealthy,
		wantReady,
		wantMsg,
	)
}
12 changes: 6 additions & 6 deletions pkg/health/health_aws_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,22 @@ func TestMapAWSStatus(t *testing.T) {
}{
{
name: "ec2",
args: args{status: "shutting-down", resourceType: AWSResourceTypeEC2},
args: args{status: "shutting-down", resourceType: ""},
want: "Shutting Down",
},
{
name: "unknown resource",
args: args{status: "shutting-down", resourceType: "blob"},
want: HealthStatusUnknown,
want: "Shutting Down",
},
{
name: "unknown type",
args: args{status: "wakingup", resourceType: AWSResourceTypeEC2},
want: HealthStatusUnknown,
name: "Wakingup",
args: args{status: "wakingup", resourceType: ""},
want: "Wakingup",
},
{
name: "cloudformation",
args: args{status: "import_rollback_complete", resourceType: AWSResourceTypeCloudformationStack},
args: args{status: "import_rollback_complete", resourceType: ""},
want: HealthStatusCode("Import Rollback Complete"),
},
}
Expand Down
5 changes: 2 additions & 3 deletions pkg/health/health_cert_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
certmanagerv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
)

var defaultCertExpiryWarningPeriod = time.Hour * 24 * 2
Expand All @@ -18,7 +17,7 @@ func SetDefaultCertificateExpiryWarningPeriod(p time.Duration) {

func GetCertificateRequestHealth(obj *unstructured.Unstructured) (*HealthStatus, error) {
var certReq certmanagerv1.CertificateRequest
if err := runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, &certReq); err != nil {
if err := convertFromUnstructured(obj, &certReq); err != nil {
return nil, fmt.Errorf("failed to convert unstructured certificateRequest to typed: %w", err)
}

Expand Down Expand Up @@ -84,7 +83,7 @@ func GetCertificateRequestHealth(obj *unstructured.Unstructured) (*HealthStatus,

func GetCertificateHealth(obj *unstructured.Unstructured) (*HealthStatus, error) {
var cert certmanagerv1.Certificate
if err := runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, &cert); err != nil {
if err := convertFromUnstructured(obj, &cert); err != nil {
return nil, fmt.Errorf("failed to convert unstructured certificate to typed: %w", err)
}

Expand Down
Loading