diff --git a/client/pkg/omni/resources/omni/labels.go b/client/pkg/omni/resources/omni/labels.go index 3f13622fa..4ddcd47ae 100644 --- a/client/pkg/omni/resources/omni/labels.go +++ b/client/pkg/omni/resources/omni/labels.go @@ -46,9 +46,6 @@ const ( // tsgen:LabelMachine LabelMachine = SystemLabelPrefix + "machine" - // LabelSkipTeardown is the test label that configures machine set controller to skip teardown sequence for the cluster machine. - LabelSkipTeardown = SystemLabelPrefix + "machine-set-skip-teardown" - // LabelSystemPatch marks the patch as the system patch, so it shouldn't be editable by the user. // tsgen:LabelSystemPatch LabelSystemPatch = SystemLabelPrefix + "system-patch" diff --git a/cmd/omni-integration-test/pkg/tests/machines.go b/cmd/omni-integration-test/pkg/tests/machines.go index 3d9c46b3e..0526f0f99 100644 --- a/cmd/omni-integration-test/pkg/tests/machines.go +++ b/cmd/omni-integration-test/pkg/tests/machines.go @@ -180,7 +180,7 @@ func AssertForceRemoveWorkerNode(testCtx context.Context, st state.State, cluste // The VM is wiped & rebooted to bring it back as an available machine. func AssertControlPlaneForceReplaceMachine(testCtx context.Context, st state.State, clusterName string, options Options) TestFunc { return func(t *testing.T) { - ctx, cancel := context.WithTimeout(testCtx, 90*time.Second) + ctx, cancel := context.WithTimeout(testCtx, 5*time.Minute) defer cancel() if options.WipeAMachineFunc == nil { diff --git a/internal/backend/runtime/omni/controllers/helpers/helpers.go b/internal/backend/runtime/omni/controllers/helpers/helpers.go new file mode 100644 index 000000000..0e0dbc13a --- /dev/null +++ b/internal/backend/runtime/omni/controllers/helpers/helpers.go @@ -0,0 +1,129 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. + +// Package helpers contains common utility methods for COSI controllers of Omni. 
+package helpers + +import ( + "crypto/sha256" + "encoding/hex" + "fmt" + "strings" + + "github.com/cosi-project/runtime/pkg/resource" + "github.com/cosi-project/runtime/pkg/resource/kvutils" + "github.com/siderolabs/gen/xslices" + + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" +) + +// InputResourceVersionAnnotation is the annotation name where the inputs version sha is stored. +const InputResourceVersionAnnotation = "inputResourceVersion" + +// UpdateInputsVersions hashes the type, ID and version of every input into InputResourceVersionAnnotation on out and returns true if the annotation has changed. +func UpdateInputsVersions[T resource.Resource](out resource.Resource, inputs ...T) bool { + return UpdateInputsAnnotation(out, xslices.Map(inputs, func(input T) string { + return fmt.Sprintf("%s/%s@%s", input.Metadata().Type(), input.Metadata().ID(), input.Metadata().Version()) + })...) +} + +// UpdateInputsAnnotation stores the hex-encoded SHA-256 of the comma-separated versions in InputResourceVersionAnnotation on out and returns true if the stored value has changed. +func UpdateInputsAnnotation(out resource.Resource, versions ...string) bool { + hash := sha256.New() + + for i, version := range versions { + if i > 0 { + hash.Write([]byte(",")) + } + + hash.Write([]byte(version)) + } + + inVersion := hex.EncodeToString(hash.Sum(nil)) + + version, found := out.Metadata().Annotations().Get(InputResourceVersionAnnotation) + + if found && version == inVersion { + return false + } + + out.Metadata().Annotations().Set(InputResourceVersionAnnotation, inVersion) + + return true +} + +// CopyAllLabels copies all labels from one resource to another. +func CopyAllLabels(src, dst resource.Resource) { + dst.Metadata().Labels().Do(func(tmp kvutils.TempKV) { + for key, value := range src.Metadata().Labels().Raw() { + tmp.Set(key, value) + } + }) +} + +// CopyLabels copies the labels with the given keys from src to dst, keys which are not set on src are skipped. 
+func CopyLabels(src, dst resource.Resource, keys ...string) { + dst.Metadata().Labels().Do(func(tmp kvutils.TempKV) { + for _, key := range keys { + if label, ok := src.Metadata().Labels().Get(key); ok { + tmp.Set(key, label) + } + } + }) +} + +// CopyAllAnnotations copies all annotations from one resource to another. +func CopyAllAnnotations(src, dst resource.Resource) { + dst.Metadata().Annotations().Do(func(tmp kvutils.TempKV) { + for key, value := range src.Metadata().Annotations().Raw() { + tmp.Set(key, value) + } + }) +} + +// CopyAnnotations copies the annotations with the given keys from src to dst, keys which are not set on src are skipped. +func CopyAnnotations(src, dst resource.Resource, annotations ...string) { + dst.Metadata().Annotations().Do(func(tmp kvutils.TempKV) { + for _, key := range annotations { + if value, ok := src.Metadata().Annotations().Get(key); ok { + tmp.Set(key, value) + } + } + }) +} + +// CopyUserLabels replaces the user labels of the target resource with the user labels from the given map. +// It removes all user labels on the target that are not present in the map. +// System labels (keys starting with omni.SystemLabelPrefix) are neither copied nor removed. +func CopyUserLabels(target resource.Resource, labels map[string]string) { + ClearUserLabels(target) + + if len(labels) == 0 { + return + } + + target.Metadata().Labels().Do(func(tmp kvutils.TempKV) { + for key, value := range labels { + if strings.HasPrefix(key, omni.SystemLabelPrefix) { + continue + } + + tmp.Set(key, value) + } + }) +} + +// ClearUserLabels removes all user labels from the resource. 
+func ClearUserLabels(res resource.Resource) { + res.Metadata().Labels().Do(func(tmp kvutils.TempKV) { + for key := range res.Metadata().Labels().Raw() { + if strings.HasPrefix(key, omni.SystemLabelPrefix) { + continue + } + + tmp.Delete(key) + } + }) +} diff --git a/internal/backend/runtime/omni/controllers/helpers/helpers_test.go b/internal/backend/runtime/omni/controllers/helpers/helpers_test.go new file mode 100644 index 000000000..536c2210c --- /dev/null +++ b/internal/backend/runtime/omni/controllers/helpers/helpers_test.go @@ -0,0 +1,36 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. + +package helpers_test + +import ( + "testing" + + "github.com/cosi-project/runtime/pkg/resource" + "github.com/stretchr/testify/assert" + + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" +) + +func TestUpdateInputsVersions(t *testing.T) { + out := omni.NewCluster("default", "test") + + in := []resource.Resource{omni.NewMachine("default", "test1"), omni.NewMachine("default", "test2")} + + assert.True(t, helpers.UpdateInputsVersions(out, in...)) + + v, _ := out.Metadata().Annotations().Get("inputResourceVersion") + assert.Equal(t, "a7a451e614fc3b4a7241798235001fea271c7ad5493c392f0a012104379bdb89", v) + + assert.False(t, helpers.UpdateInputsVersions(out, in...)) + + in = append(in, omni.NewClusterMachine("default", "cm1")) + + assert.True(t, helpers.UpdateInputsVersions(out, in...)) + + v, _ = out.Metadata().Annotations().Get("inputResourceVersion") + assert.Equal(t, "df4af53c3caf7ae4c0446bcf8b854ed3f5740a47eab0e5151f1962a4a4d52f6f", v) +} diff --git a/internal/backend/runtime/omni/controllers/omni/cluster_machine_config.go b/internal/backend/runtime/omni/controllers/omni/cluster_machine_config.go index 92ea6f67a..2c1284c03 100644 --- 
a/internal/backend/runtime/omni/controllers/omni/cluster_machine_config.go +++ b/internal/backend/runtime/omni/controllers/omni/cluster_machine_config.go @@ -31,11 +31,14 @@ import ( "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/mappers" appconfig "github.com/siderolabs/omni/internal/pkg/config" "github.com/siderolabs/omni/internal/pkg/constants" ) +const clusterMachineConfigControllerName = "ClusterMachineConfigController" + // ClusterMachineConfigController manages machine configurations for each ClusterMachine. // // ClusterMachineConfigController generates machine configuration for each created machine. @@ -45,7 +48,7 @@ type ClusterMachineConfigController = qtransform.QController[*omni.ClusterMachin func NewClusterMachineConfigController(defaultGenOptions []generate.Option) *ClusterMachineConfigController { return qtransform.NewQController( qtransform.Settings[*omni.ClusterMachine, *omni.ClusterMachineConfig]{ - Name: "ClusterMachineConfigController", + Name: clusterMachineConfigControllerName, MapMetadataFunc: func(clusterMachine *omni.ClusterMachine) *omni.ClusterMachineConfig { return omni.NewClusterMachineConfig(resources.DefaultNamespace, clusterMachine.Metadata().ID()) }, @@ -204,11 +207,11 @@ func reconcileClusterMachineConfig( clusterMachineTalosVersion, } - if !UpdateInputsVersions(machineConfig, inputs...) { + if !helpers.UpdateInputsVersions(machineConfig, inputs...) 
{ return xerrors.NewTagged[qtransform.SkipReconcileTag](errors.New("config inputs not changed")) } - machineConfig.Metadata().Labels().Set(omni.LabelCluster, clusterName) + helpers.CopyLabels(clusterMachine, machineConfig, omni.LabelMachineSet, omni.LabelCluster, omni.LabelControlPlaneRole, omni.LabelWorkerRole) // TODO: temporary transition code, remove in the future if clusterMachine.TypedSpec().Value.KubernetesVersion == "" { diff --git a/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status.go b/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status.go index 094fd6861..2d323239d 100644 --- a/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status.go +++ b/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status.go @@ -33,12 +33,16 @@ import ( "github.com/siderolabs/omni/client/pkg/meta" "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/mappers" talosutils "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/talos" "github.com/siderolabs/omni/internal/backend/runtime/talos" ) -const gracefulResetAttemptCount = 2 +const ( + gracefulResetAttemptCount = 4 + etcdLeaveAttemptsLimit = 2 +) // ClusterMachineConfigStatusController manages ClusterMachineStatus resource lifecycle. 
// @@ -161,6 +165,8 @@ func NewClusterMachineConfigStatusController() *ClusterMachineConfigStatusContro return fmt.Errorf("failed to apply config to machine '%s': %w", machineConfig.Metadata().ID(), err) } + helpers.CopyLabels(machineConfig, configStatus, omni.LabelMachineSet, omni.LabelCluster, omni.LabelControlPlaneRole, omni.LabelWorkerRole) + configStatus.TypedSpec().Value.ClusterMachineVersion = machineConfig.TypedSpec().Value.ClusterMachineVersion configStatus.TypedSpec().Value.ClusterMachineConfigVersion = machineConfig.Metadata().Version().String() configStatus.TypedSpec().Value.ClusterMachineConfigSha256 = shaSumString @@ -219,8 +225,8 @@ func NewClusterMachineConfigStatusController() *ClusterMachineConfigStatusContro } type resetStatus struct { - attempts uint - forceUngraceful bool + resetAttempts uint + etcdLeaveAttempts uint } type ongoingResets struct { @@ -243,7 +249,16 @@ func (r *ongoingResets) isGraceful(id resource.ID) bool { return true } - return !rs.forceUngraceful && rs.attempts < gracefulResetAttemptCount + return rs.resetAttempts < gracefulResetAttemptCount +} + +func (r *ongoingResets) shouldLeaveEtcd(id string) bool { + rs, ok := r.getStatus(id) + if !ok { + return true + } + + return rs.etcdLeaveAttempts < etcdLeaveAttemptsLimit } func (r *ongoingResets) handleReset(id resource.ID) uint { @@ -254,12 +269,12 @@ func (r *ongoingResets) handleReset(id resource.ID) uint { r.statuses[id] = &resetStatus{} } - r.statuses[id].attempts++ + r.statuses[id].resetAttempts++ - return r.statuses[id].attempts + return r.statuses[id].resetAttempts } -func (r *ongoingResets) forceUngraceful(id resource.ID) { +func (r *ongoingResets) handleEtcdLeave(id resource.ID) { r.mu.Lock() defer r.mu.Unlock() @@ -267,7 +282,7 @@ func (r *ongoingResets) forceUngraceful(id resource.ID) { r.statuses[id] = &resetStatus{} } - r.statuses[id].forceUngraceful = true + r.statuses[id].etcdLeaveAttempts++ } func (r *ongoingResets) deleteStatus(id resource.ID) { @@ -456,7 +471,7 
@@ func (h *clusterMachineConfigStatusControllerHandler) reset( machineConfig *omni.ClusterMachineConfig, clusterMachine *omni.ClusterMachine, ) error { - ctx, cancel := context.WithTimeout(ctx, 5*time.Second) + ctx, cancel := context.WithTimeout(ctx, 10*time.Second) defer cancel() logger := h.logger.With( @@ -512,6 +527,10 @@ func (h *clusterMachineConfigStatusControllerHandler) reset( machineStage := statusSnapshot.TypedSpec().Value.GetMachineStatus().GetStage() + if machineStage == machineapi.MachineStatusEvent_RESETTING { + return controller.NewRequeueErrorf(time.Minute, "the machine is already being reset") + } + logger.Debug("getting ready to reset the machine", zap.Stringer("stage", machineStage)) inMaintenance := machineStage == machineapi.MachineStatusEvent_MAINTENANCE @@ -554,7 +573,7 @@ func (h *clusterMachineConfigStatusControllerHandler) reset( graceful = false } - isControlPlane := isControlPlane(clusterMachine) + _, isControlPlane := clusterMachine.Metadata().Labels().Get(omni.LabelControlPlaneRole) switch { // check that machine is ready to be reset @@ -587,19 +606,12 @@ func (h *clusterMachineConfigStatusControllerHandler) reset( } // if is control plane first leave etcd - if graceful && isControlPlane { - _, err = c.EtcdForfeitLeadership(ctx, &machineapi.EtcdForfeitLeadershipRequest{}) - if err != nil { - h.ongoingResets.forceUngraceful(clusterMachine.Metadata().ID()) + if isControlPlane && h.ongoingResets.shouldLeaveEtcd(clusterMachine.Metadata().ID()) { + h.ongoingResets.handleEtcdLeave(clusterMachine.Metadata().ID()) - return fmt.Errorf("failed to forfeit leadership, node %q: %w", machineConfig.Metadata().ID(), err) - } - - err = c.EtcdLeaveCluster(ctx, &machineapi.EtcdLeaveClusterRequest{}) + err = h.gracefulEtcdLeave(ctx, c, clusterMachine.Metadata().ID()) if err != nil { - h.ongoingResets.forceUngraceful(clusterMachine.Metadata().ID()) - - return fmt.Errorf("failed to leave etcd cluster, node %q: %w", machineConfig.Metadata().ID(), err) + 
return controller.NewRequeueError(err, time.Second) } } @@ -632,6 +644,20 @@ func (h *clusterMachineConfigStatusControllerHandler) reset( return fmt.Errorf("failed resetting node '%s': %w", machineConfig.Metadata().ID(), err) } +func (h *clusterMachineConfigStatusControllerHandler) gracefulEtcdLeave(ctx context.Context, c *client.Client, id string) error { + _, err := c.EtcdForfeitLeadership(ctx, &machineapi.EtcdForfeitLeadershipRequest{}) + if err != nil { + return fmt.Errorf("failed to forfeit leadership, node %q: %w", id, err) + } + + err = c.EtcdLeaveCluster(ctx, &machineapi.EtcdLeaveClusterRequest{}) + if err != nil { + return fmt.Errorf("failed to leave etcd cluster, node %q: %w", id, err) + } + + return nil +} + func (h *clusterMachineConfigStatusControllerHandler) getClient( ctx context.Context, useMaintenance bool, diff --git a/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status_test.go b/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status_test.go index 4636ef9e7..d8ac89a79 100644 --- a/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status_test.go +++ b/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status_test.go @@ -362,16 +362,13 @@ func (suite *ClusterMachineConfigStatusSuite) TestResetUngraceful() { suite.destroyCluster(cluster) for _, m := range machines { - suite.Assert().NoError(retry.Constant(5*time.Second, retry.WithUnits(100*time.Millisecond)).Retry( + suite.Assert().NoError(retry.Constant(30*time.Second, retry.WithUnits(100*time.Millisecond)).Retry( suite.assertNoResource(*omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, m.Metadata().ID()).Metadata()), )) } - for i, m := range machines { - count := 3 - if i == brokenEtcdMachine { - count = 1 - } + for _, m := range machines { + count := 5 suite.Assert().Len(machineServices[m.Metadata().ID()].getResetRequests(), count) } diff --git 
a/internal/backend/runtime/omni/controllers/omni/cluster_machine_identity.go b/internal/backend/runtime/omni/controllers/omni/cluster_machine_identity.go index 4c048bf77..4860f6262 100644 --- a/internal/backend/runtime/omni/controllers/omni/cluster_machine_identity.go +++ b/internal/backend/runtime/omni/controllers/omni/cluster_machine_identity.go @@ -17,6 +17,7 @@ import ( "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/task" "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/task/clustermachine" ) @@ -75,7 +76,7 @@ func (ctrl *ClusterMachineIdentityController) Run(ctx context.Context, r control return nil case clusterMachineIdentity := <-notifyCh: err := safe.WriterModify(ctx, r, clusterMachineIdentity, func(res *omni.ClusterMachineIdentity) error { - CopyAllLabels(clusterMachineIdentity, res) + helpers.CopyAllLabels(clusterMachineIdentity, res) spec := clusterMachineIdentity.TypedSpec().Value if spec.EtcdMemberId != 0 { @@ -136,6 +137,11 @@ func (ctrl *ClusterMachineIdentityController) reconcileCollectors(ctx context.Co return fmt.Errorf("failed to determine the cluster of the cluster machine %s", id) } + machineSetName, ok := clusterMachine.Metadata().Labels().Get(omni.LabelMachineSet) + if !ok { + return fmt.Errorf("failed to determine the machine set of the cluster machine %s", id) + } + machine, err := safe.ReaderGet[*omni.Machine](ctx, r, omni.NewMachine(resources.DefaultNamespace, id).Metadata(), ) @@ -173,6 +179,7 @@ func (ctrl *ClusterMachineIdentityController) reconcileCollectors(ctx context.Co machine.TypedSpec().Value.ManagementAddress, isControlPlane, clusterName, + machineSetName, ) } diff --git a/internal/backend/runtime/omni/controllers/omni/cluster_machine_status.go 
b/internal/backend/runtime/omni/controllers/omni/cluster_machine_status.go index ebcc9d479..5a958a472 100644 --- a/internal/backend/runtime/omni/controllers/omni/cluster_machine_status.go +++ b/internal/backend/runtime/omni/controllers/omni/cluster_machine_status.go @@ -19,6 +19,7 @@ import ( "github.com/siderolabs/omni/client/api/omni/specs" "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" ) // ClusterMachineStatusController reflects the status of a machine that is a member of a cluster. @@ -43,7 +44,7 @@ func NewClusterMachineStatusController() *ClusterMachineStatusController { return err } - CopyLabels(clusterMachine, clusterMachineStatus, + helpers.CopyLabels(clusterMachine, clusterMachineStatus, omni.LabelCluster, omni.LabelControlPlaneRole, omni.LabelWorkerRole, @@ -142,6 +143,7 @@ func NewClusterMachineStatusController() *ClusterMachineStatusController { cmsVal.ApidAvailable = machine.TypedSpec().Value.Connected } + // should we also mark it as not ready when the clustermachine is tearing down (?) 
cmsVal.Ready = status.GetStatus().GetReady() && machine.TypedSpec().Value.Connected if clusterMachine.Metadata().Phase() == resource.PhaseTearingDown { @@ -227,3 +229,7 @@ func configApplyStatus(spec *specs.ClusterMachineStatusSpec) specs.ConfigApplySt return specs.ConfigApplyStatus_FAILED } } + +func isOutdated(clusterMachine *omni.ClusterMachine, configStatus *omni.ClusterMachineConfigStatus) bool { + return configStatus.TypedSpec().Value.ClusterMachineVersion != clusterMachine.Metadata().Version().String() || configStatus.TypedSpec().Value.LastConfigError != "" +} diff --git a/internal/backend/runtime/omni/controllers/omni/cluster_machine_teardown.go b/internal/backend/runtime/omni/controllers/omni/cluster_machine_teardown.go new file mode 100644 index 000000000..3e9ca5415 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/cluster_machine_teardown.go @@ -0,0 +1,378 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+ +package omni + +import ( + "context" + "crypto/tls" + "fmt" + "net" + "net/url" + "time" + + "github.com/cosi-project/runtime/pkg/controller" + "github.com/cosi-project/runtime/pkg/controller/generic" + "github.com/cosi-project/runtime/pkg/controller/generic/qtransform" + "github.com/cosi-project/runtime/pkg/resource" + "github.com/cosi-project/runtime/pkg/safe" + "github.com/cosi-project/runtime/pkg/state" + serverpb "github.com/siderolabs/discovery-api/api/v1alpha1/server/pb" + "github.com/siderolabs/gen/optional" + "github.com/siderolabs/talos/pkg/machinery/constants" + "go.uber.org/zap" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/siderolabs/omni/client/pkg/omni/resources" + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime" + "github.com/siderolabs/omni/internal/backend/runtime/kubernetes" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/mappers" +) + +// ClusterMachineTeardownControllerName is the name of the ClusterMachineTeardownController. +const ClusterMachineTeardownControllerName = "ClusterMachineTeardownController" + +// ClusterMachineTeardownController processes additional teardown steps for a machine leaving a machine set. +type ClusterMachineTeardownController struct { + generic.NamedController +} + +// NewClusterMachineTeardownController initializes ClusterMachineTeardownController. +func NewClusterMachineTeardownController() *ClusterMachineTeardownController { + return &ClusterMachineTeardownController{ + NamedController: generic.NamedController{ + ControllerName: ClusterMachineTeardownControllerName, + }, + } +} + +// Settings implements controller.QController interface. 
+func (ctrl *ClusterMachineTeardownController) Settings() controller.QSettings { + return controller.QSettings{ + Inputs: []controller.Input{ + { + Namespace: resources.DefaultNamespace, + Type: omni.ClusterMachineType, + Kind: controller.InputQPrimary, + }, + { + Namespace: resources.DefaultNamespace, + Type: omni.ClusterType, + Kind: controller.InputQMappedDestroyReady, + }, + { + Namespace: resources.DefaultNamespace, + Type: omni.ClusterSecretsType, + Kind: controller.InputQMapped, + }, + { + Namespace: resources.DefaultNamespace, + Type: omni.ClusterMachineIdentityType, + Kind: controller.InputQMapped, + }, + // TODO: drop after adding nodes + members audit + { + Namespace: resources.DefaultNamespace, + Type: omni.MachineSetType, + Kind: controller.InputQMapped, + }, + }, + Concurrency: optional.Some[uint](4), + } +} + +// MapInput implements controller.QController interface. +func (ctrl *ClusterMachineTeardownController) MapInput(ctx context.Context, logger *zap.Logger, r controller.QRuntime, ptr resource.Pointer) ([]resource.Pointer, error) { + switch ptr.Type() { + case omni.ClusterType: + mapper := mappers.MapByClusterLabel[*omni.Cluster, *omni.ClusterMachine]() + + return mapper(ctx, logger, r, omni.NewCluster(ptr.Namespace(), ptr.ID())) + case omni.ClusterMachineIdentityType: + mapper := qtransform.MapperSameID[*omni.ClusterMachineIdentity, *omni.ClusterMachine]() + + return mapper(ctx, logger, r, omni.NewClusterMachineIdentity(ptr.Namespace(), ptr.ID())) + case omni.ClusterSecretsType: + mapper := mappers.MapByClusterLabel[*omni.ClusterSecrets, *omni.ClusterMachine]() + + return mapper(ctx, logger, r, omni.NewClusterSecrets(ptr.Namespace(), ptr.ID())) + case omni.MachineSetType: + machines, err := safe.ReaderListAll[*omni.ClusterMachine](ctx, r, state.WithLabelQuery( + resource.LabelEqual(omni.LabelMachineSet, ptr.ID()), + )) + if err != nil { + return nil, err + } + + return safe.Map(machines, func(r *omni.ClusterMachine) (resource.Pointer, error) { + 
return r.Metadata(), nil + }) + } + + return nil, fmt.Errorf("unexpected resource type %q", ptr.Type()) +} + +// Reconcile implements controller.QController interface. +func (ctrl *ClusterMachineTeardownController) Reconcile(ctx context.Context, logger *zap.Logger, r controller.QRuntime, ptr resource.Pointer) error { + clusterMachine, err := safe.ReaderGetByID[*omni.ClusterMachine](ctx, r, ptr.ID()) + if err != nil { + if state.IsNotFoundError(err) { + return nil + } + + return err + } + + clusterName, ok := clusterMachine.Metadata().Labels().Get(omni.LabelCluster) + if !ok { + return fmt.Errorf("failed to determine the cluster name of the cluster machine %q", ptr.ID()) + } + + logger = logger.With(zap.String("machine", clusterMachine.Metadata().ID()), zap.String("cluster", clusterName)) + + cluster, err := safe.ReaderGetByID[*omni.Cluster](ctx, r, clusterName) + if err != nil && !state.IsNotFoundError(err) { + return err + } + + // if the cluster is running and the machine is running add finalizer with this controller name + if clusterMachine.Metadata().Phase() == resource.PhaseRunning { + if !clusterMachine.Metadata().Finalizers().Has(ctrl.Name()) { + return r.AddFinalizer(ctx, ptr, ctrl.Name()) + } + + return nil + } + + // no finalizer on the machine skip teardown + if !clusterMachine.Metadata().Finalizers().Has(ctrl.Name()) { + return nil + } + + // do not run teardown until the cluster machine config status controller does reset + if clusterMachine.Metadata().Finalizers().Has(clusterMachineConfigControllerName) { + logger.Info("skipping teardown, waiting for reset") + + return nil + } + + // remove finalizer without any actions if the cluster resource is either absent or is tearing down + if cluster == nil || cluster.Metadata().Phase() == resource.PhaseTearingDown { + return r.RemoveFinalizer(ctx, ptr, ctrl.Name()) + } + + ctx, cancel := context.WithTimeout(ctx, time.Second*20) + defer cancel() + + // teardown member and node + if err = 
ctrl.teardownNodeMember(ctx, r, logger, clusterMachine); err != nil { + logger.Warn("failed to teardown member or node for the cluster machine", zap.Error(err)) + + // TODO: remove the rest of this "IF" when we get nodes and members audit + machineSetName, ok := clusterMachine.Metadata().Labels().Get(omni.LabelMachineSet) + if !ok { + return r.RemoveFinalizer(ctx, ptr, ctrl.Name()) + } + + machineSet, e := r.Get(ctx, omni.NewMachineSet(resources.DefaultNamespace, machineSetName).Metadata()) + if e != nil { // e is the machine set lookup error, err still holds the teardown error + if state.IsNotFoundError(e) { // the machine set no longer exists: drop the finalizer instead of retrying + return r.RemoveFinalizer(ctx, ptr, ctrl.Name()) + } + + return fmt.Errorf("failed to get machine set %q: %w", machineSetName, e) + } + + // Ignore teardown errors for CP nodes which machine set is being torn down + if _, ok := machineSet.Metadata().Labels().Get(omni.LabelControlPlaneRole); ok && machineSet.Metadata().Phase() == resource.PhaseTearingDown { + return r.RemoveFinalizer(ctx, ptr, ctrl.Name()) + } + + return err + } + + logger.Info("cleaned up member and node for cluster machine") + + return r.RemoveFinalizer(ctx, ptr, ctrl.Name()) +} + +func (ctrl *ClusterMachineTeardownController) teardownNodeMember( + ctx context.Context, + r controller.QRuntime, + logger *zap.Logger, + clusterMachine *omni.ClusterMachine, +) error { + clusterName, ok := clusterMachine.Metadata().Labels().Get(omni.LabelCluster) + if !ok { + return fmt.Errorf("failed to determine cluster name of the cluster machine %s", clusterMachine.Metadata().ID()) + } + + list, err := safe.ReaderListAll[*omni.ClusterMachineIdentity]( + ctx, + r, + state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, clusterName)), + ) + if err != nil { + return fmt.Errorf("error listing cluster %q machine identities: %w", clusterName, err) + } + + nodeNameOccurences := map[string]int{} + clusterMachineIdentities := map[string]*omni.ClusterMachineIdentity{} + + list.ForEach(func(res *omni.ClusterMachineIdentity) { + clusterMachineIdentities[res.Metadata().ID()] = res + nodeNameOccurences[res.TypedSpec().Value.Nodename]++ + }) + + 
clusterMachineIdentity := clusterMachineIdentities[clusterMachine.Metadata().ID()] + + secrets, err := safe.ReaderGet[*omni.ClusterSecrets](ctx, r, omni.NewClusterSecrets( + resources.DefaultNamespace, + clusterName, + ).Metadata()) + if err != nil { + if state.IsNotFoundError(err) { + return nil + } + + return fmt.Errorf("failed to get cluster %q secrets: %w", clusterName, err) + } + + bundle, err := omni.ToSecretsBundle(secrets) + if err != nil { + return fmt.Errorf("failed to convert cluster %q secrets to bundle: %w", clusterName, err) + } + + if clusterMachineIdentity == nil { + return nil + } + + if nodeNameOccurences[clusterMachineIdentity.TypedSpec().Value.Nodename] == 1 { + if err = ctrl.teardownNode(ctx, clusterMachine, clusterMachineIdentity); err != nil { + return fmt.Errorf("error tearing down node %q: %w", clusterMachineIdentity.TypedSpec().Value.Nodename, err) + } + } + + if err = ctrl.deleteMember(ctx, r, bundle.Cluster.ID, clusterMachine, logger); err != nil { + return fmt.Errorf( + "error deleting member %q: %w", + clusterMachineIdentity.TypedSpec().Value.Nodename, + err, + ) + } + + return nil +} + +func (ctrl *ClusterMachineTeardownController) deleteMember( + ctx context.Context, + r controller.ReaderWriter, + clusterID string, + clusterMachine *omni.ClusterMachine, + logger *zap.Logger, +) error { + clusterMachineIdentity, err := safe.ReaderGet[*omni.ClusterMachineIdentity]( + ctx, + r, + omni.NewClusterMachineIdentity(resources.DefaultNamespace, clusterMachine.Metadata().ID()).Metadata(), + ) + if err != nil { + if state.IsNotFoundError(err) { + return nil + } + + return fmt.Errorf("error getting identity: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, time.Second*5) + defer cancel() + + conn, err := ctrl.createDiscoveryClient(ctx) + if err != nil { + return fmt.Errorf("error creating discovery client: %w", err) + } + + defer func() { + if err = conn.Close(); err != nil { + logger.Error("error closing discovery client connection", 
zap.Error(err)) + } + }() + + discoveryClient := serverpb.NewClusterClient(conn) + + _, err = discoveryClient.AffiliateDelete(ctx, &serverpb.AffiliateDeleteRequest{ + ClusterId: clusterID, + AffiliateId: clusterMachineIdentity.TypedSpec().Value.NodeIdentity, + }) + if err != nil { + return fmt.Errorf( + "error deleting member %q: %w", + clusterMachineIdentity.TypedSpec().Value.NodeIdentity, + err, + ) + } + + return nil +} + +func (ctrl *ClusterMachineTeardownController) teardownNode( + ctx context.Context, + clusterMachine *omni.ClusterMachine, + clusterMachineIdentity *omni.ClusterMachineIdentity, +) error { + clusterName, ok := clusterMachine.Metadata().Labels().Get(omni.LabelCluster) + if !ok { + return fmt.Errorf("cluster machine %s doesn't have cluster label set", clusterMachine.Metadata().ID()) + } + + type kubeRuntime interface { + GetClient(ctx context.Context, cluster string) (*kubernetes.Client, error) + } + + k8s, err := runtime.LookupInterface[kubeRuntime](kubernetes.Name) + if err != nil { + return err + } + + k8sClient, err := k8s.GetClient(ctx, clusterName) + if err != nil { + return fmt.Errorf("error getting kubernetes client for cluster %q: %w", clusterName, err) + } + + ctx, cancel := context.WithTimeout(ctx, time.Second*5) + defer cancel() + + nodename := clusterMachineIdentity.TypedSpec().Value.Nodename + + err = k8sClient.Clientset().CoreV1().Nodes().Delete(ctx, nodename, metav1.DeleteOptions{}) + if err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("error deleting node %q in cluster %q: %w", nodename, clusterName, err) + } + + return nil +} + +func (ctrl *ClusterMachineTeardownController) createDiscoveryClient(ctx context.Context) (*grpc.ClientConn, error) { + u, err := url.Parse(constants.DefaultDiscoveryServiceEndpoint) + if err != nil { + return nil, err + } + + discoveryConn, err := grpc.DialContext(ctx, net.JoinHostPort(u.Host, "443"), + grpc.WithTransportCredentials( + credentials.NewTLS(&tls.Config{}), + ), + 
grpc.WithSharedWriteBuffer(true), + ) + if err != nil { + return nil, err + } + + return discoveryConn, nil +} diff --git a/internal/backend/runtime/omni/controllers/omni/cluster_status.go b/internal/backend/runtime/omni/controllers/omni/cluster_status.go index d1a5c0eb1..d5409ae2d 100644 --- a/internal/backend/runtime/omni/controllers/omni/cluster_status.go +++ b/internal/backend/runtime/omni/controllers/omni/cluster_status.go @@ -18,6 +18,7 @@ import ( "github.com/siderolabs/omni/client/api/omni/specs" "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/mappers" ) @@ -135,7 +136,7 @@ func NewClusterStatusController() *ClusterStatusController { HasConnectedControlPlanes: clusterHasConnectedControlPlanes, } - CopyUserLabels(clusterStatus, cluster.Metadata().Labels().Raw()) + helpers.CopyUserLabels(clusterStatus, cluster.Metadata().Labels().Raw()) return nil }, diff --git a/internal/backend/runtime/omni/controllers/omni/image_pull_status.go b/internal/backend/runtime/omni/controllers/omni/image_pull_status.go index 642a9dd9b..d2b344922 100644 --- a/internal/backend/runtime/omni/controllers/omni/image_pull_status.go +++ b/internal/backend/runtime/omni/controllers/omni/image_pull_status.go @@ -16,6 +16,7 @@ import ( "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/image" "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/task" imagetask "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/task/image" @@ -90,7 +91,7 @@ func (ctrl *ImagePullStatusController) 
updatePullStatus(ctx context.Context, r c if err := safe.WriterModify[*omni.ImagePullStatus](ctx, r, omni.NewImagePullStatus(resources.DefaultNamespace, pullStatus.Request.Metadata().ID()), func(status *omni.ImagePullStatus) error { - CopyAllLabels(pullStatus.Request, status) + helpers.CopyAllLabels(pullStatus.Request, status) status.TypedSpec().Value.LastProcessedNode = pullStatus.Node status.TypedSpec().Value.LastProcessedImage = pullStatus.Image diff --git a/internal/backend/runtime/omni/controllers/omni/internal/configpatch/configpatch.go b/internal/backend/runtime/omni/controllers/omni/internal/configpatch/configpatch.go index ebd514219..618e6cf4d 100644 --- a/internal/backend/runtime/omni/controllers/omni/internal/configpatch/configpatch.go +++ b/internal/backend/runtime/omni/controllers/omni/internal/configpatch/configpatch.go @@ -13,6 +13,7 @@ import ( "github.com/cosi-project/runtime/pkg/controller" "github.com/cosi-project/runtime/pkg/resource" "github.com/cosi-project/runtime/pkg/safe" + "github.com/siderolabs/gen/xslices" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" ) @@ -80,5 +81,7 @@ func (h *Helper) Get(machine *omni.ClusterMachine, machineSet *omni.MachineSet) patches = append(patches, patch) } - return patches, nil + return xslices.Filter(patches, func(configPatch *omni.ConfigPatch) bool { + return configPatch.Metadata().Phase() == resource.PhaseRunning + }), nil } diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/control_planes_handler.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/control_planes_handler.go new file mode 100644 index 000000000..5ebb488f6 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/control_planes_handler.go @@ -0,0 +1,69 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+ +package machineset + +import ( + "context" + + "github.com/siderolabs/gen/xslices" + + "github.com/siderolabs/omni/internal/backend/runtime/omni/pkg/check" +) + +// ReconcileControlPlanes gets the reconciliation context and produces the list of changes to apply on the machine set. +func ReconcileControlPlanes(ctx context.Context, rc *ReconciliationContext, etcdStatusGetter func(ctx context.Context) (*check.EtcdStatusResult, error)) ([]Operation, error) { + toCreate := xslices.Map(rc.GetMachinesToCreate(), func(id string) Operation { + return &Create{ID: id} + }) + + // create all machines and exit + if len(toCreate) > 0 { + return toCreate, nil + } + + // destroy all ready for destroy machines + if len(rc.GetMachinesToDestroy()) > 0 { + return xslices.Map(rc.GetMachinesToDestroy(), func(id string) Operation { + return &Destroy{ID: id} + }), nil + } + + if !rc.LBHealthy() { + return nil, nil + } + + // pending tearing down machines with finalizers should cancel any other operations + if len(rc.GetTearingDownMachines()) > 0 { + return nil, nil + } + + // do a single destroy + for _, id := range rc.GetMachinesToTeardown() { + clusterMachine, ok := rc.GetClusterMachine(id) + if !ok { + continue + } + + etcdStatus, err := etcdStatusGetter(ctx) + if err != nil { + return nil, err + } + + if err := check.CanScaleDown(etcdStatus, clusterMachine); err != nil { + return nil, err + } + + return []Operation{&Teardown{ID: id}}, nil + } + + // do a single update + toUpdate := rc.GetMachinesToUpdate() + if len(toUpdate) > 0 { + return []Operation{&Update{ID: toUpdate[0]}}, nil + } + + return nil, nil +} diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/control_planes_handler_test.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/control_planes_handler_test.go new file mode 100644 index 000000000..3bab6c254 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/control_planes_handler_test.go @@ 
-0,0 +1,247 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. + +package machineset_test + +import ( + "context" + "testing" + + "github.com/cosi-project/runtime/pkg/resource" + "github.com/stretchr/testify/require" + + "github.com/siderolabs/omni/client/api/omni/specs" + "github.com/siderolabs/omni/client/pkg/omni/resources" + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/machineset" + "github.com/siderolabs/omni/internal/backend/runtime/omni/pkg/check" +) + +func TestControlPlanesHandler(t *testing.T) { + machineSet := omni.NewMachineSet(resources.DefaultNamespace, "test") + + version := resource.VersionUndefined.Next() + + //nolint:govet + for _, tt := range []struct { + name string + machineSet *specs.MachineSetSpec + machineSetNodes []*omni.MachineSetNode + clusterMachines []*omni.ClusterMachine + clusterMachineConfigStatuses []*omni.ClusterMachineConfigStatus + clusterMachineConfigPatches []*omni.ClusterMachineConfigPatches + etcdStatus *check.EtcdStatusResult + + expectError bool + expectOperations []machineset.Operation + }{ + { + name: "create nodes", + machineSet: &specs.MachineSetSpec{}, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", machineSet), + omni.NewMachineSetNode(resources.DefaultNamespace, "b", machineSet), + omni.NewMachineSetNode(resources.DefaultNamespace, "c", machineSet), + }, + expectOperations: []machineset.Operation{ + &machineset.Create{ID: "a"}, + &machineset.Create{ID: "b"}, + &machineset.Create{ID: "c"}, + }, + }, + { + name: "create nodes when scaling down", + machineSet: &specs.MachineSetSpec{}, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", machineSet), + omni.NewMachineSetNode(resources.DefaultNamespace, "b", machineSet), + 
omni.NewMachineSetNode(resources.DefaultNamespace, "c", machineSet), + }, + clusterMachines: []*omni.ClusterMachine{ + omni.NewClusterMachine(resources.DefaultNamespace, "a"), + tearingDown(omni.NewClusterMachine(resources.DefaultNamespace, "b")), + }, + expectOperations: []machineset.Operation{ + &machineset.Create{ID: "c"}, + }, + }, + { + name: "destroy tearing down", + machineSet: &specs.MachineSetSpec{}, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", machineSet), + }, + clusterMachines: []*omni.ClusterMachine{ + omni.NewClusterMachine(resources.DefaultNamespace, "a"), + tearingDownNoFinalizers(omni.NewClusterMachine(resources.DefaultNamespace, "b")), + omni.NewClusterMachine(resources.DefaultNamespace, "c"), + }, + expectOperations: []machineset.Operation{ + &machineset.Teardown{ID: "b"}, + }, + }, + { + name: "destroy one", + machineSet: &specs.MachineSetSpec{}, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", machineSet), + }, + clusterMachines: []*omni.ClusterMachine{ + omni.NewClusterMachine(resources.DefaultNamespace, "a"), + omni.NewClusterMachine(resources.DefaultNamespace, "c"), + omni.NewClusterMachine(resources.DefaultNamespace, "d"), + }, + expectOperations: []machineset.Operation{ + &machineset.Teardown{ + ID: "c", + }, + }, + etcdStatus: &check.EtcdStatusResult{ + Members: map[string]check.EtcdMemberStatus{ + "a": { + Healthy: true, + }, + "c": { + Healthy: true, + }, + }, + HealthyMembers: 2, + }, + }, + { + name: "requeue due to unhealthy etcd", + machineSet: &specs.MachineSetSpec{}, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", machineSet), + }, + clusterMachines: []*omni.ClusterMachine{ + omni.NewClusterMachine(resources.DefaultNamespace, "a"), + omni.NewClusterMachine(resources.DefaultNamespace, "c"), + }, + expectError: true, + expectOperations: []machineset.Operation{}, + 
etcdStatus: &check.EtcdStatusResult{ + Members: map[string]check.EtcdMemberStatus{ + "a": { + Healthy: true, + }, + "c": { + Healthy: false, + }, + }, + HealthyMembers: 1, + }, + }, + { + name: "update a machine", + machineSet: &specs.MachineSetSpec{}, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", machineSet), + }, + clusterMachines: []*omni.ClusterMachine{ + omni.NewClusterMachine(resources.DefaultNamespace, "a"), + }, + expectOperations: []machineset.Operation{ + &machineset.Update{ + ID: "a", + }, + }, + etcdStatus: &check.EtcdStatusResult{ + Members: map[string]check.EtcdMemberStatus{ + "a": { + Healthy: true, + }, + }, + HealthyMembers: 1, + }, + }, + { + name: "no actions", + machineSet: &specs.MachineSetSpec{}, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", machineSet), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions[*omni.ClusterMachine, *omni.ConfigPatch](withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "a"), version)), + }, + clusterMachineConfigPatches: []*omni.ClusterMachineConfigPatches{ + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "a"), + }, + clusterMachineConfigStatuses: []*omni.ClusterMachineConfigStatus{ + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "a"), version), + }, + expectOperations: []machineset.Operation{}, + etcdStatus: &check.EtcdStatusResult{ + Members: map[string]check.EtcdMemberStatus{ + "a": { + Healthy: true, + }, + }, + HealthyMembers: 1, + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + require := require.New(t) + + machineSet.TypedSpec().Value = tt.machineSet + + cluster := omni.NewCluster(resources.DefaultNamespace, tt.name) + cluster.TypedSpec().Value.TalosVersion = "v1.5.4" + cluster.TypedSpec().Value.KubernetesVersion = "v1.29.1" + + rc, err := machineset.NewReconciliationContext( + cluster, + 
machineSet, + newHealthyLB(cluster.Metadata().ID()), + &fakePatchHelper{}, + tt.machineSetNodes, + tt.clusterMachines, + tt.clusterMachineConfigStatuses, + tt.clusterMachineConfigPatches, + nil, + ) + + require.NoError(err) + + etcdStatus := tt.etcdStatus + if etcdStatus == nil { + etcdStatus = &check.EtcdStatusResult{} + } + + operations, err := machineset.ReconcileControlPlanes(context.Background(), rc, func(context.Context) (*check.EtcdStatusResult, error) { + return etcdStatus, nil + }) + + if !tt.expectError { + require.NoError(err) + } + + require.Equal(len(tt.expectOperations), len(operations)) + + for i, op := range operations { + expected := tt.expectOperations[i] + + switch value := op.(type) { + case *machineset.Create: + create, ok := expected.(*machineset.Create) + + require.True(ok, "the operation at %d is not create", i) + require.Equal(create.ID, value.ID) + case *machineset.Update: + update, ok := expected.(*machineset.Update) + + require.True(ok, "the operation at %d is not update", i) + require.Equal(update.ID, value.ID) + case *machineset.Teardown: + destroy, ok := expected.(*machineset.Teardown) + + require.True(ok, "the operation at %d is not destroy", i) + require.Equal(destroy.ID, value.ID) + } + } + }) + } +} diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/machineset.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/machineset.go new file mode 100644 index 000000000..589474176 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/machineset.go @@ -0,0 +1,101 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. + +// Package machineset contains machine set controller reconciliation code. 
+package machineset + +import ( + "context" + + "github.com/cosi-project/runtime/pkg/controller" + "github.com/cosi-project/runtime/pkg/resource" + "github.com/cosi-project/runtime/pkg/safe" + "go.uber.org/zap" + + "github.com/siderolabs/omni/client/pkg/omni/resources" + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/pkg/check" +) + +// ControllerName controller name constant. +const ControllerName = "MachineSetStatusController" + +func toSlice[T resource.Resource](list safe.List[T]) []T { + res := make([]T, 0, list.Len()) + + list.ForEach(func(t T) { + res = append(res, t) + }) + + return res +} + +// ReconcileMachines creates, updates and tears down the machines using the ReconciliationContext. +func ReconcileMachines(ctx context.Context, r controller.ReaderWriter, logger *zap.Logger, rc *ReconciliationContext) (bool, error) { + var operations []Operation + + machineSet := rc.GetMachineSet() + + _, isControlPlane := machineSet.Metadata().Labels().Get(omni.LabelControlPlaneRole) + + switch { + case machineSet.Metadata().Phase() == resource.PhaseTearingDown: + operations = ReconcileTearingDown(rc) + case isControlPlane: + var err error + + operations, err = ReconcileControlPlanes(ctx, rc, func(ctx context.Context) (*check.EtcdStatusResult, error) { + return check.EtcdStatus(ctx, r, machineSet) + }) + if err != nil { + return false, err + } + default: + operations = ReconcileWorkers(rc) + } + + for _, op := range operations { + if err := op.Apply(ctx, r, logger, rc); err != nil { + return false, err + } + } + + return len(operations) > 0, nil +} + +// UpdateFinalizers sets finalizers to all machine set nodes, adds finalizers on the Machines resources. 
+func UpdateFinalizers(ctx context.Context, r controller.ReaderWriter, rc *ReconciliationContext) error { + // add finalizers to all running machine set nodes + machineSetRunning := rc.GetMachineSet().Metadata().Phase() == resource.PhaseRunning + + for _, machineSetNode := range rc.GetMachineSetNodes() { + if machineSetNode.Metadata().Phase() == resource.PhaseTearingDown || !machineSetRunning { + if err := r.RemoveFinalizer(ctx, machineSetNode.Metadata(), ControllerName); err != nil { + return err + } + + continue + } + + if machineSetNode.Metadata().Finalizers().Has(ControllerName) { + continue + } + + if err := r.AddFinalizer(ctx, machineSetNode.Metadata(), ControllerName); err != nil { + return err + } + } + + // add finalizers to all machines which have running cluster machines. + for _, clusterMachine := range rc.GetClusterMachines() { + if clusterMachine.Metadata().Phase() == resource.PhaseRunning && !clusterMachine.Metadata().Finalizers().Has(ControllerName) { + if err := r.AddFinalizer(ctx, omni.NewMachine(resources.DefaultNamespace, clusterMachine.Metadata().ID()).Metadata(), ControllerName); err != nil { + return err + } + } + } + + return nil +} diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/machineset_test.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/machineset_test.go new file mode 100644 index 000000000..66a734074 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/machineset_test.go @@ -0,0 +1,59 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+ +package machineset_test + +import ( + "github.com/cosi-project/runtime/pkg/resource" + + "github.com/siderolabs/omni/client/pkg/omni/resources" + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/machineset" +) + +func tearingDown[T resource.Resource](res T) T { + res = tearingDownNoFinalizers(res) + + res.Metadata().Finalizers().Add(machineset.ControllerName) + + return res +} + +func tearingDownNoFinalizers[T resource.Resource](res T) T { + res.Metadata().SetPhase(resource.PhaseTearingDown) + + return res +} + +func withVersion[T resource.Resource](res T, version resource.Version) T { + res.Metadata().SetVersion(version) + + return res +} + +func withSpecSetter[T resource.Resource](res T, f func(T)) T { + f(res) + + return res +} + +func withUpdateInputVersions[T, R resource.Resource](res T, inputs ...R) T { + helpers.UpdateInputsVersions(res, inputs...) + + return res +} + +func withClusterMachineVersionSetter(cmcs *omni.ClusterMachineConfigStatus, version resource.Version) *omni.ClusterMachineConfigStatus { + return withSpecSetter(cmcs, func(cmcs *omni.ClusterMachineConfigStatus) { + cmcs.TypedSpec().Value.ClusterMachineVersion = version.String() + }) +} + +func newHealthyLB(id string) *omni.LoadBalancerStatus { + return withSpecSetter(omni.NewLoadBalancerStatus(resources.DefaultNamespace, id), func(r *omni.LoadBalancerStatus) { + r.TypedSpec().Value.Healthy = true + }) +} diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/operations.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/operations.go new file mode 100644 index 000000000..e100b17a9 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/operations.go @@ -0,0 +1,208 @@ +// Copyright (c) 2024 Sidero Labs, Inc. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. + +package machineset + +import ( + "context" + "fmt" + + "github.com/cosi-project/runtime/pkg/controller" + "github.com/cosi-project/runtime/pkg/resource" + "github.com/cosi-project/runtime/pkg/safe" + "github.com/cosi-project/runtime/pkg/state" + "go.uber.org/zap" + + "github.com/siderolabs/omni/client/pkg/omni/resources" + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" +) + +// Operation is a single operation which alters the machine set. +type Operation interface { + Apply(ctx context.Context, r controller.ReaderWriter, logger *zap.Logger, rc *ReconciliationContext) error +} + +// Create the cluster machine. +type Create struct { + ID string +} + +// Apply implements Operation interface. +func (c *Create) Apply(ctx context.Context, r controller.ReaderWriter, logger *zap.Logger, rc *ReconciliationContext) error { + clusterMachine := omni.NewClusterMachine(resources.DefaultNamespace, c.ID) + clusterMachineConfigPatches := omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, c.ID) + + machineSet := rc.GetMachineSet() + configPatches := rc.GetConfigPatches(c.ID) + + helpers.CopyLabels(machineSet, clusterMachineConfigPatches, omni.LabelCluster, omni.LabelWorkerRole, omni.LabelControlPlaneRole) + helpers.CopyLabels(machineSet, clusterMachine, omni.LabelCluster, omni.LabelWorkerRole, omni.LabelControlPlaneRole) + + clusterMachine.Metadata().Labels().Set(omni.LabelMachineSet, machineSet.Metadata().ID()) + clusterMachineConfigPatches.Metadata().Labels().Set(omni.LabelMachineSet, machineSet.Metadata().ID()) + + helpers.UpdateInputsVersions(clusterMachine, configPatches...) 
+ setPatches(clusterMachineConfigPatches, configPatches) + + clusterMachine.TypedSpec().Value.KubernetesVersion = rc.GetCluster().TypedSpec().Value.KubernetesVersion + + logger.Info("create cluster machine", zap.String("machine", c.ID)) + + if err := r.Create(ctx, clusterMachine); err != nil { + return err + } + + return r.Create(ctx, clusterMachineConfigPatches) +} + +// Teardown the cluster machine. +type Teardown struct { + Quota *ChangeQuota + ID string +} + +// Apply implements Operation interface. +func (d *Teardown) Apply(ctx context.Context, r controller.ReaderWriter, logger *zap.Logger, rc *ReconciliationContext) error { + if !d.Quota.Use(opDelete) { + logger.Info("teardown is waiting for quota", zap.String("machine", d.ID), zap.Strings("pending", Values(rc.GetTearingDownMachines()))) + + return nil + } + + logger.Info("teardown cluster machine", zap.String("machine", d.ID)) + + if _, err := r.Teardown(ctx, omni.NewClusterMachine(resources.DefaultNamespace, d.ID).Metadata()); err != nil { + return fmt.Errorf( + "error tearing down machine %q in cluster %q: %w", + d.ID, + rc.GetCluster().Metadata().ID(), + err, + ) + } + + return nil +} + +// Update the configs of the machine by updating the ClusterMachineConfigPatches. +type Update struct { + Quota *ChangeQuota + ID string +} + +// Apply implements Operation interface. +func (u *Update) Apply(ctx context.Context, r controller.ReaderWriter, logger *zap.Logger, rc *ReconciliationContext) error { + clusterMachine, ok := rc.GetClusterMachine(u.ID) + if !ok { + return fmt.Errorf("cluster machine with id %q doesn't exist", u.ID) + } + + configPatches := rc.GetConfigPatches(u.ID) + + // nothing changed in the patch list, skip any updates + if !helpers.UpdateInputsVersions(clusterMachine, configPatches...) 
{ + return nil + } + + ignoreQuota := rc.GetUpdatingMachines().Contains(u.ID) + + // if updating we don't care about the quota + if !ignoreQuota { + if !u.Quota.Use(opUpdate) { + logger.Info("update is waiting for quota", zap.String("machine", u.ID), zap.Strings("pending", Values(rc.GetUpdatingMachines()))) + + return nil + } + } + + logger.Info("update cluster machine", zap.String("machine", u.ID), zap.Bool("ignore_quota", ignoreQuota)) + + // update ClusterMachineConfigPatches resource with the list of matching patches for the machine + err := safe.WriterModify(ctx, r, omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, u.ID), + func(clusterMachineConfigPatches *omni.ClusterMachineConfigPatches) error { + setPatches(clusterMachineConfigPatches, configPatches) + + return nil + }, + ) + if err != nil { + return err + } + + // finally update checksum for the incoming config patches in the ClusterMachine resource + return safe.WriterModify(ctx, r, clusterMachine, func(res *omni.ClusterMachine) error { + // don't update the ClusterMachine if it's still owned by another cluster + currentClusterName, ok := res.Metadata().Labels().Get(omni.LabelCluster) + if ok && currentClusterName != rc.cluster.Metadata().ID() { + return nil + } + + helpers.CopyAllAnnotations(clusterMachine, res) + + if res.TypedSpec().Value.KubernetesVersion == "" { + res.TypedSpec().Value.KubernetesVersion = rc.GetCluster().TypedSpec().Value.KubernetesVersion + } + + return nil + }) +} + +// Destroy cleans up the cluster machines without finalizers. +type Destroy struct { + ID string +} + +// Apply implements Operation interface. 
+func (d *Destroy) Apply(ctx context.Context, r controller.ReaderWriter, logger *zap.Logger, rc *ReconciliationContext) error { + clusterMachine, ok := rc.GetClusterMachine(d.ID) + if !ok { + return fmt.Errorf("machine with id %q doesn't exist in the state", d.ID) + } + + if clusterMachine.Metadata().Phase() == resource.PhaseRunning { + return nil + } + + if !clusterMachine.Metadata().Finalizers().Empty() { + return nil + } + + configPatches := omni.NewClusterMachineConfigPatches(clusterMachine.Metadata().Namespace(), clusterMachine.Metadata().ID()) + + if err := r.Destroy(ctx, configPatches.Metadata()); err != nil && !state.IsNotFoundError(err) { + return err + } + + if err := r.Destroy(ctx, clusterMachine.Metadata()); err != nil && !state.IsNotFoundError(err) { + return err + } + + // release the Machine finalizer + if err := r.RemoveFinalizer( + ctx, + omni.NewMachine(resources.DefaultNamespace, clusterMachine.Metadata().ID()).Metadata(), + ControllerName, + ); err != nil && !state.IsNotFoundError(err) { + return fmt.Errorf("error removing finalizer from machine %q: %w", clusterMachine.Metadata().ID(), err) + } + + logger.Info("deleted the machine", + zap.String("machine", clusterMachine.Metadata().ID()), + ) + + return nil +} + +func setPatches( + clusterMachineConfigPatches *omni.ClusterMachineConfigPatches, + patches []*omni.ConfigPatch, +) { + patchesRaw := make([]string, 0, len(patches)) + for _, p := range patches { + patchesRaw = append(patchesRaw, p.TypedSpec().Value.Data) + } + + clusterMachineConfigPatches.TypedSpec().Value.Patches = patchesRaw +} diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/operations_test.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/operations_test.go new file mode 100644 index 000000000..9902b55bc --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/operations_test.go @@ -0,0 +1,436 @@ +// Copyright (c) 2024 Sidero Labs, Inc. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. + +package machineset_test + +import ( + "context" + "testing" + "time" + + "github.com/cosi-project/runtime/pkg/controller" + "github.com/cosi-project/runtime/pkg/resource" + "github.com/cosi-project/runtime/pkg/resource/rtestutils" + "github.com/cosi-project/runtime/pkg/safe" + "github.com/cosi-project/runtime/pkg/state" + "github.com/cosi-project/runtime/pkg/state/impl/inmem" + "github.com/cosi-project/runtime/pkg/state/impl/namespaced" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/siderolabs/omni/client/pkg/omni/resources" + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/machineset" +) + +var logger *zap.Logger + +type runtime struct { + state.State +} + +func (rt *runtime) Create(ctx context.Context, r resource.Resource) error { + return rt.State.Create(ctx, r) +} + +func (rt *runtime) Destroy(ctx context.Context, r resource.Pointer, _ ...controller.Option) error { + return rt.State.Destroy(ctx, r) +} + +func (rt *runtime) Modify(ctx context.Context, r resource.Resource, f func(resource.Resource) error) error { + _, err := rt.ModifyWithResult(ctx, r, f) + + return err +} + +func (rt *runtime) Update(ctx context.Context, r resource.Resource) error { + return rt.State.Update(ctx, r) +} + +func (rt *runtime) ModifyWithResult(ctx context.Context, r resource.Resource, f func(resource.Resource) error) (resource.Resource, error) { + res, err := rt.State.UpdateWithConflicts(ctx, r.Metadata(), f) + if state.IsNotFoundError(err) { + res = r.DeepCopy() + + if err = f(res); err != nil { + return nil, err + } + + return res, rt.State.Create(ctx, res) + } + + return res, err +} + +func (rt *runtime) Teardown(ctx 
context.Context, r resource.Pointer, _ ...controller.Option) (bool, error) { + return rt.State.Teardown(ctx, r) +} + +func createRuntime() *runtime { + state := state.WrapCore(namespaced.NewState(inmem.Build)) + + return &runtime{state} +} + +// TestCreate runs create once, checks that both cluster machine and cluster machine config status resources were created. +// Validate that both resources have the right data in the spec. +func TestCreate(t *testing.T) { + rt := createRuntime() + + cluster := omni.NewCluster(resources.DefaultNamespace, "test") + cluster.TypedSpec().Value.KubernetesVersion = "v1.6.2" + + machineSet := omni.NewMachineSet(resources.DefaultNamespace, "machineset") + + create := machineset.Create{ID: "aa"} + + require := require.New(t) + + ctx := context.Background() + + patch := omni.NewConfigPatch(resources.DefaultNamespace, "some") + patch.TypedSpec().Value.Data = `machine: + network: + kubespan: + enabled: true` + + rc, err := machineset.NewReconciliationContext( + cluster, + machineSet, + newHealthyLB(cluster.Metadata().ID()), + &fakePatchHelper{ + patches: map[string][]*omni.ConfigPatch{ + "aa": { + patch, + }, + }, + }, + []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "aa", machineSet), + }, + nil, + nil, + nil, + nil, + ) + require.NoError(err) + + clusterMachine := omni.NewClusterMachine(resources.DefaultNamespace, "aa") + + helpers.UpdateInputsVersions(clusterMachine, patch) + + inputsSHA, ok := clusterMachine.Metadata().Annotations().Get(helpers.InputResourceVersionAnnotation) + require.True(ok) + + require.NoError(create.Apply(ctx, rt, logger, rc)) + + clusterMachine, err = safe.ReaderGetByID[*omni.ClusterMachine](ctx, rt, "aa") + + actualInputsSHA, ok := clusterMachine.Metadata().Annotations().Get(helpers.InputResourceVersionAnnotation) + require.True(ok) + require.Equal(inputsSHA, actualInputsSHA) + + require.NoError(err) + + require.NotEmpty(clusterMachine.TypedSpec().Value.KubernetesVersion) + + 
clusterMachineConfigPatches, err := safe.ReaderGetByID[*omni.ClusterMachineConfigPatches](ctx, rt, "aa")

	require.NoError(err)

	require.NotEmpty(clusterMachineConfigPatches.TypedSpec().Value.Patches)
}

// TestUpdate runs the Update operation 4 times against the same cluster machine:
//   - quota is 2, the machine has 1 patch.
//   - quota is 1, the machine should have 2 patches, verify that config patches resource was synced.
//   - quota is 0, the machine is still updating, so the update should work.
//   - quota is 0, the machine config status is synced, no update should happen.
func TestUpdate(t *testing.T) {
	rt := createRuntime()

	cluster := omni.NewCluster(resources.DefaultNamespace, "test")
	cluster.TypedSpec().Value.KubernetesVersion = "v1.6.4"

	machineSet := omni.NewMachineSet(resources.DefaultNamespace, "machineset")

	quota := &machineset.ChangeQuota{
		Update: 2,
	}

	update := machineset.Update{ID: "aa", Quota: quota}

	require := require.New(t)

	ctx := context.Background()

	patch1 := omni.NewConfigPatch(resources.DefaultNamespace, "some")
	patch1.TypedSpec().Value.Data = `machine:
  network:
    kubespan:
      enabled: true`

	patch2 := omni.NewConfigPatch(resources.DefaultNamespace, "some")
	patch2.TypedSpec().Value.Data = `machine:
  network:
    hostname: some`

	patchHelper := &fakePatchHelper{
		patches: map[string][]*omni.ConfigPatch{
			"aa": {
				patch1,
			},
		},
	}

	clusterMachine := omni.NewClusterMachine(resources.DefaultNamespace, "aa")
	clusterMachine.Metadata().SetVersion(resource.VersionUndefined.Next())

	configStatus := omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "aa")
	configStatus.TypedSpec().Value.ClusterMachineVersion = clusterMachine.Metadata().Version().String()

	var (
		rc  *machineset.ReconciliationContext
		err error
	)

	// updateReconciliationContext rebuilds rc from the current values of the
	// captured variables (clusterMachine, configStatus, patchHelper), so it has
	// to be called again after any of them changes.
	updateReconciliationContext := func() {
		rc, err = machineset.NewReconciliationContext(
			cluster,
			machineSet,
			newHealthyLB(cluster.Metadata().ID()),
			patchHelper,
			[]*omni.MachineSetNode{
				omni.NewMachineSetNode(resources.DefaultNamespace, "aa", machineSet),
			},
			[]*omni.ClusterMachine{
				clusterMachine,
			},
			[]*omni.ClusterMachineConfigStatus{
				configStatus,
			},
			[]*omni.ClusterMachineConfigPatches{
				omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "aa"),
			},
			nil,
		)
		require.NoError(err)
	}

	updateReconciliationContext()

	require.NoError(update.Apply(ctx, rt, logger, rc))

	require.Equal(1, quota.Update)

	patchHelper.patches["aa"] = append(patchHelper.patches["aa"], patch2)

	updateReconciliationContext()

	// compute the inputs hash expected for both patches on a scratch resource
	clusterMachine = omni.NewClusterMachine(resources.DefaultNamespace, "aa")
	helpers.UpdateInputsVersions(clusterMachine, patch1, patch2)

	inputsSHA, ok := clusterMachine.Metadata().Annotations().Get(helpers.InputResourceVersionAnnotation)
	require.True(ok)

	require.NoError(update.Apply(ctx, rt, logger, rc))

	require.Equal(0, quota.Update)

	clusterMachine, err = safe.ReaderGetByID[*omni.ClusterMachine](ctx, rt, "aa")
	// check the error before touching the resource: it is nil on failure
	require.NoError(err)

	actualInputsSHA, ok := clusterMachine.Metadata().Annotations().Get(helpers.InputResourceVersionAnnotation)
	require.True(ok)
	require.Equal(inputsSHA, actualInputsSHA)

	require.NotEmpty(clusterMachine.TypedSpec().Value.KubernetesVersion)

	clusterMachineConfigPatches, err := safe.ReaderGetByID[*omni.ClusterMachineConfigPatches](ctx, rt, "aa")

	require.NoError(err)

	require.NotEmpty(clusterMachineConfigPatches.TypedSpec().Value.Patches)

	patchHelper.patches["aa"] = append(patchHelper.patches["aa"], patch1)

	updateReconciliationContext()

	clusterMachine, err = safe.ReaderGetByID[*omni.ClusterMachine](ctx, rt, "aa")
	require.NoError(err)

	version := clusterMachine.Metadata().Version()

	// update should happen as the machine update is still pending
	require.NoError(update.Apply(ctx, rt, logger, rc))

	clusterMachine, err = safe.ReaderGetByID[*omni.ClusterMachine](ctx, rt, "aa")
	require.NoError(err)
	require.False(clusterMachine.Metadata().Version().Equal(version))

	version = clusterMachine.Metadata().Version()

	// simulate config synced
	configStatus.TypedSpec().Value.ClusterMachineVersion = clusterMachine.Metadata().Version().String()

	updateReconciliationContext()

	// update shouldn't happen as the quota reached
	require.NoError(update.Apply(ctx, rt, logger, rc))

	clusterMachine, err = safe.ReaderGetByID[*omni.ClusterMachine](ctx, rt, "aa")
	require.NoError(err)
	require.True(clusterMachine.Metadata().Version().Equal(version))
}

// TestTeardown creates 2 cluster machines and tears them down with a teardown quota of 1:
// the first teardown should proceed, the second one should be skipped.
func TestTeardown(t *testing.T) {
	rt := createRuntime()

	cluster := omni.NewCluster(resources.DefaultNamespace, "test")
	cluster.TypedSpec().Value.KubernetesVersion = "v1.6.3"

	machineSet := omni.NewMachineSet(resources.DefaultNamespace, "machineset")

	quota := machineset.ChangeQuota{
		Teardown: 1,
	}

	require := require.New(t)

	ctx := context.Background()

	clusterMachines := []*omni.ClusterMachine{
		omni.NewClusterMachine(resources.DefaultNamespace, "aa"),
		omni.NewClusterMachine(resources.DefaultNamespace, "bb"),
	}

	for _, cm := range clusterMachines {
		cm.Metadata().Labels().Set(omni.LabelCluster, cluster.Metadata().ID())
		cm.Metadata().Labels().Set(omni.LabelMachineSet, machineSet.Metadata().ID())

		require.NoError(rt.Create(ctx, cm))
	}

	rc, err := machineset.NewReconciliationContext(
		cluster,
		machineSet,
		newHealthyLB(cluster.Metadata().ID()),
		&fakePatchHelper{},
		[]*omni.MachineSetNode{
			omni.NewMachineSetNode(resources.DefaultNamespace, "aa", machineSet),
			omni.NewMachineSetNode(resources.DefaultNamespace, "bb", machineSet),
		},
		clusterMachines,
		nil,
		nil,
		nil,
	)
	require.NoError(err)

	// both operations share the same quota, so the first one consumes it
	teardown := machineset.Teardown{ID: "aa", Quota: &quota}
	require.NoError(teardown.Apply(ctx, rt, logger, rc))

	require.Equal(0, quota.Teardown)

	teardown = machineset.Teardown{ID: "bb", Quota: &quota}
	require.NoError(teardown.Apply(ctx, rt, logger, rc))

	require.Equal(0, quota.Teardown)

	clusterMachine, err := safe.ReaderGetByID[*omni.ClusterMachine](ctx, rt, "aa")

	require.NoError(err)
	require.Equal(resource.PhaseTearingDown, clusterMachine.Metadata().Phase())

	clusterMachine, err = safe.ReaderGetByID[*omni.ClusterMachine](ctx, rt, "bb")

	require.NoError(err)
	require.Equal(resource.PhaseRunning, clusterMachine.Metadata().Phase())
}

// TestDestroy creates tearing down machines and destroys them: no cluster machine
// or config patches resources should be left after the operation is complete, and
// the finalizers should be removed from the machines.
func TestDestroy(t *testing.T) {
	rt := createRuntime()

	cluster := omni.NewCluster(resources.DefaultNamespace, "test")
	cluster.TypedSpec().Value.KubernetesVersion = "v1.6.3"

	machineSet := omni.NewMachineSet(resources.DefaultNamespace, "machineset")

	require := require.New(t)

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	clusterMachines := []*omni.ClusterMachine{
		omni.NewClusterMachine(resources.DefaultNamespace, "aa"),
		omni.NewClusterMachine(resources.DefaultNamespace, "bb"),
	}

	for _, cm := range clusterMachines {
		cm.Metadata().Labels().Set(omni.LabelCluster, cluster.Metadata().ID())
		cm.Metadata().Labels().Set(omni.LabelMachineSet, machineSet.Metadata().ID())

		cmcp := omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, cm.Metadata().ID())
		helpers.CopyAllLabels(cm, cmcp)

		machine := omni.NewMachine(resources.DefaultNamespace, cm.Metadata().ID())
		machine.Metadata().Finalizers().Add(machineset.ControllerName)

		require.NoError(rt.Create(ctx, cm))
		require.NoError(rt.Create(ctx, cmcp))
		require.NoError(rt.Create(ctx, machine))

		_, err := rt.Teardown(ctx, cm.Metadata())
		require.NoError(err)

		// mirror the state change on the local copy passed to the reconciliation context
		cm.Metadata().SetPhase(resource.PhaseTearingDown)
	}

	rc, err := machineset.NewReconciliationContext(
		cluster,
		machineSet,
		newHealthyLB(cluster.Metadata().ID()),
		&fakePatchHelper{},
		nil,
		clusterMachines,
		nil,
		nil,
		nil,
	)
	require.NoError(err)

	destroy := machineset.Destroy{ID: "aa"}
	require.NoError(destroy.Apply(ctx, rt, logger, rc))

	destroy = machineset.Destroy{ID: "bb"}
	require.NoError(destroy.Apply(ctx, rt, logger, rc))

	rtestutils.AssertNoResource[*omni.ClusterMachine](ctx, t, rt.State, "aa")
	rtestutils.AssertNoResource[*omni.ClusterMachine](ctx, t, rt.State, "bb")
	rtestutils.AssertNoResource[*omni.ClusterMachineConfigPatches](ctx, t, rt.State, "aa")
	rtestutils.AssertNoResource[*omni.ClusterMachineConfigPatches](ctx, t, rt.State, "bb")
	rtestutils.AssertResources(ctx, t, rt.State, []string{"aa", "bb"}, func(r *omni.Machine, assertion *assert.Assertions) {
		assertion.True(r.Metadata().Finalizers().Empty())
	})
}

// init sets up the development logger shared by the tests in this file.
func init() {
	var err error

	logger, err = zap.NewDevelopment()
	if err != nil {
		panic(err)
	}
}
diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/reconciliation_context.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/reconciliation_context.go
new file mode 100644
index 000000000..3150b4e7b
--- /dev/null
+++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/reconciliation_context.go
@@ -0,0 +1,489 @@
// Copyright (c) 2024 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.
+ +package machineset + +import ( + "context" + "fmt" + + "github.com/cosi-project/runtime/pkg/controller" + "github.com/cosi-project/runtime/pkg/controller/generic/qtransform" + "github.com/cosi-project/runtime/pkg/resource" + "github.com/cosi-project/runtime/pkg/safe" + "github.com/cosi-project/runtime/pkg/state" + "github.com/siderolabs/gen/xerrors" + "github.com/siderolabs/gen/xslices" + + "github.com/siderolabs/omni/client/api/omni/specs" + "github.com/siderolabs/omni/client/pkg/omni/resources" + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/configpatch" +) + +const ( + opDelete = iota + opUpdate +) + +// ChangeQuota defines allowed number of machine deletes and update for a single reconcile call. +type ChangeQuota struct { + Teardown int + Update int +} + +// Use decreases quota type by 1 if it's > 0 or returns false if no more op of the kind are available. +func (q *ChangeQuota) Use(op int) bool { + if q == nil { + return true + } + + var qvalue *int + + switch op { + case opDelete: + qvalue = &q.Teardown + case opUpdate: + qvalue = &q.Update + default: + panic(fmt.Sprintf("unknown op kind %d", op)) + } + + if *qvalue == -1 { + return true + } + + if *qvalue == 0 { + return false + } + + *qvalue-- + + return true +} + +// ReconciliationContext describes all related data for one reconciliation call of the machine set status controller. 
+type ReconciliationContext struct { + machineSet *omni.MachineSet + cluster *omni.Cluster + patchesByMachine map[resource.ID][]*omni.ConfigPatch + machineSetNodesMap map[resource.ID]*omni.MachineSetNode + clusterMachinesMap map[resource.ID]*omni.ClusterMachine + clusterMachineConfigStatusesMap map[resource.ID]*omni.ClusterMachineConfigStatus + clusterMachineConfigPatchesMap map[resource.ID]*omni.ClusterMachineConfigPatches + clusterMachineStatusesMap map[resource.ID]*omni.ClusterMachineStatus + + runningMachineSetNodesSet Set[string] + idsTearingDown Set[string] + idsUnconfigured Set[string] + idsOutdated Set[string] + idsDestroyReady Set[string] + + idsToTeardown []string + idsToCreate []string + idsToUpdate []string + idsToDestroy []string + + lbHealthy bool +} + +type patchHelper interface { + Get(*omni.ClusterMachine, *omni.MachineSet) ([]*omni.ConfigPatch, error) +} + +// BuildReconciliationContext is the COSI reader dependent method to build the reconciliation context. +func BuildReconciliationContext( + ctx context.Context, r controller.Reader, machineSet *omni.MachineSet, +) (*ReconciliationContext, error) { + clusterName, ok := machineSet.Metadata().Labels().Get(omni.LabelCluster) + if !ok { + return nil, fmt.Errorf("failed to determine the cluster of the machine set %q", machineSet.Metadata().ID()) + } + + cluster, err := safe.ReaderGetByID[*omni.Cluster](ctx, r, clusterName) + if err != nil { + if state.IsNotFoundError(err) { + return nil, xerrors.NewTagged[qtransform.SkipReconcileTag](err) + } + + return nil, err + } + + loadBalancerStatus, err := safe.ReaderGetByID[*omni.LoadBalancerStatus](ctx, r, clusterName) + if err != nil && !state.IsNotFoundError(err) { + return nil, err + } + + query := state.WithLabelQuery( + resource.LabelEqual( + omni.LabelMachineSet, + machineSet.Metadata().ID(), + ), + ) + + clusterMachines, err := safe.ReaderListAll[*omni.ClusterMachine](ctx, r, query) + if err != nil { + return nil, fmt.Errorf("failed to list cluster 
machines for the machine set %q: %w", machineSet.Metadata().ID(), err) + } + + machineSetNodes, err := safe.ReaderListAll[*omni.MachineSetNode](ctx, r, query) + if err != nil { + return nil, fmt.Errorf("failed to list machine set nodes for the machine set %q: %w", machineSet.Metadata().ID(), err) + } + + clusterMachineConfigStatuses, err := safe.ReaderListAll[*omni.ClusterMachineConfigStatus](ctx, r, query) + if err != nil { + return nil, fmt.Errorf("failed to list cluster machine config statuses for the machine set %q: %w", machineSet.Metadata().ID(), err) + } + + clusterMachineConfigPatches, err := safe.ReaderListAll[*omni.ClusterMachineConfigPatches](ctx, r, query) + if err != nil { + return nil, fmt.Errorf("failed to list cluster machine config patches for the machine set %q: %w", machineSet.Metadata().ID(), err) + } + + clusterMachineStatuses, err := safe.ReaderListAll[*omni.ClusterMachineStatus](ctx, r, query) + if err != nil { + return nil, fmt.Errorf("failed to list cluster machine config statuses for the machine set %q: %w", machineSet.Metadata().ID(), err) + } + + configPatchHelper, err := configpatch.NewHelper(ctx, r) + if err != nil { + return nil, fmt.Errorf("error creating config patch helper: %w", err) + } + + return NewReconciliationContext( + cluster, + machineSet, + loadBalancerStatus, + configPatchHelper, + toSlice(machineSetNodes), + toSlice(clusterMachines), + toSlice(clusterMachineConfigStatuses), + toSlice(clusterMachineConfigPatches), + toSlice(clusterMachineStatuses), + ) +} + +// NewReconciliationContext creates new state for machine set status controller reconciliation flow. 
//
// It indexes the given resources by ID and precomputes which machines should be
// created, updated, torn down and destroyed. loadbalancerStatus may be nil.
// An error is returned only if the patch helper fails.
func NewReconciliationContext(
	cluster *omni.Cluster,
	machineSet *omni.MachineSet,
	loadbalancerStatus *omni.LoadBalancerStatus,
	patchHelper patchHelper,
	machineSetNodes []*omni.MachineSetNode,
	clusterMachines []*omni.ClusterMachine,
	clusterMachineConfigStatuses []*omni.ClusterMachineConfigStatus,
	clusterMachineConfigPatches []*omni.ClusterMachineConfigPatches,
	clusterMachineStatuses []*omni.ClusterMachineStatus,
) (*ReconciliationContext, error) {
	rc := &ReconciliationContext{
		machineSet:       machineSet,
		cluster:          cluster,
		patchesByMachine: map[resource.ID][]*omni.ConfigPatch{},
	}

	// a machine set node is locked if it has the MachineLocked annotation, whatever its value is
	checkLocked := func(r *omni.MachineSetNode) bool {
		_, ok := r.Metadata().Annotations().Get(omni.MachineLocked)

		return ok
	}

	checkTearingDown := func(r *omni.ClusterMachine) bool {
		return r.Metadata().Phase() == resource.PhaseTearingDown
	}

	checkRunning := func(r *omni.MachineSetNode) bool {
		return r.Metadata().Phase() == resource.PhaseRunning
	}

	rc.clusterMachinesMap = toMap(clusterMachines)
	rc.clusterMachineConfigStatusesMap = toMap(clusterMachineConfigStatuses)
	rc.clusterMachineConfigPatchesMap = toMap(clusterMachineConfigPatches)
	rc.clusterMachineStatusesMap = toMap(clusterMachineStatuses)
	rc.machineSetNodesMap = toMap(machineSetNodes)
	rc.runningMachineSetNodesSet = toSet(xslices.Filter(machineSetNodes, checkRunning))

	clusterMachinesSet := toSet(clusterMachines)
	lockedMachinesSet := toSet(xslices.Filter(machineSetNodes, checkLocked))
	tearingDownMachinesSet := toSet(xslices.Filter(clusterMachines, checkTearingDown))
	// destroy ready: tearing down cluster machines which have no finalizers left
	rc.idsDestroyReady = toSet(xslices.Filter(clusterMachines, func(clusterMachine *omni.ClusterMachine) bool {
		return clusterMachine.Metadata().Phase() == resource.PhaseTearingDown && clusterMachine.Metadata().Finalizers().Empty()
	}))

	// cluster machines
	rc.idsToDestroy = Values(rc.idsDestroyReady)

	// if tearing down then all machines need to be torn down
	//
	// NOTE: this path returns early, so the create/update lists, the
	// tearing down/outdated/unconfigured sets and lbHealthy keep their zero values.
	if machineSet.Metadata().Phase() == resource.PhaseTearingDown {
		rc.idsToTeardown = Values(
			Difference(
				clusterMachinesSet,
				tearingDownMachinesSet,
			),
		)

		return rc, nil
	}

	// teardown candidates: cluster machines which are not yet tearing down,
	// have no running machine set node and are not locked
	// (assumes Difference subtracts every following set from the first one — defined outside of this file)
	rc.idsToTeardown = Values(
		Difference(
			clusterMachinesSet,
			tearingDownMachinesSet,
			rc.runningMachineSetNodesSet,
			lockedMachinesSet,
		),
	)

	// running machine set nodes which have no cluster machine yet
	rc.idsToCreate = Values(Difference(rc.runningMachineSetNodesSet, clusterMachinesSet))

	// still tearing down: destroy ready machines are tracked separately in idsDestroyReady
	rc.idsTearingDown = Difference(tearingDownMachinesSet, rc.idsDestroyReady)

	// only machines which have both the running node and the cluster machine,
	// and are neither tearing down nor locked, can be updated
	updateCandidates := Values(
		Difference(
			Intersection(
				rc.runningMachineSetNodesSet,
				clusterMachinesSet,
			),
			tearingDownMachinesSet,
			lockedMachinesSet,
		),
	)

	// collect the config patches for each existing or to-be-created machine,
	// using a scratch cluster machine which carries the machine set labels
	for id := range Union(rc.runningMachineSetNodesSet, clusterMachinesSet) {
		clusterMachine := omni.NewClusterMachine(resources.DefaultNamespace, id)

		helpers.CopyAllLabels(machineSet, clusterMachine)

		clusterMachine.Metadata().Labels().Set(omni.LabelMachineSet, machineSet.Metadata().ID())

		patches, err := patchHelper.Get(clusterMachine, machineSet)
		if err != nil {
			return nil, err
		}

		rc.patchesByMachine[clusterMachine.Metadata().ID()] = patches
	}

	for _, id := range updateCandidates {
		// work on a copy: UpdateInputsVersions sets the annotation on the resource it is given
		clusterMachine := rc.clusterMachinesMap[id].DeepCopy().(*omni.ClusterMachine) //nolint:forcetypeassert,errcheck

		// no config patches resource for the machine yet: it has to be updated anyway
		_, ok := rc.clusterMachineConfigPatchesMap[id]
		if !ok {
			rc.idsToUpdate = append(rc.idsToUpdate, id)

			continue
		}

		patches := rc.patchesByMachine[id]

		// true means the inputs hash stored in the annotation differs from the current patches
		if helpers.UpdateInputsVersions(clusterMachine, patches...) {
			rc.idsToUpdate = append(rc.idsToUpdate, id)
		}
	}

	rc.idsOutdated = make(Set[string])
	rc.idsUnconfigured = make(Set[string])

	// classify the machines which are not tearing down: without a config status
	// they are unconfigured, with a config status which isUpdating reports as pending they are outdated
	for id := range Difference(clusterMachinesSet, tearingDownMachinesSet) {
		clusterMachineConfigStatus, ok := rc.clusterMachineConfigStatusesMap[id]
		if !ok {
			rc.idsUnconfigured.Add(id)

			continue
		}

		if isUpdating(rc.clusterMachinesMap[id], clusterMachineConfigStatus) {
			rc.idsOutdated.Add(id)
		}
	}

	// nil load balancer status leaves lbHealthy false
	if loadbalancerStatus != nil {
		rc.lbHealthy = loadbalancerStatus.TypedSpec().Value.Healthy
	}

	return rc, nil
}

// GetMachinesToTeardown returns the IDs of the cluster machines which should be torn down:
// all not yet tearing down machines if the machine set itself is tearing down,
// otherwise the ones which have no running MachineSetNode and are not locked.
func (rc *ReconciliationContext) GetMachinesToTeardown() []string {
	return rc.idsToTeardown
}

// GetMachinesToDestroy returns all machines ready to be destroyed:
// the tearing down cluster machines which have no finalizers.
func (rc *ReconciliationContext) GetMachinesToDestroy() []string {
	return rc.idsToDestroy
}

// GetMachinesToCreate returns all machine IDs which have a running MachineSetNode but no ClusterMachine.
func (rc *ReconciliationContext) GetMachinesToCreate() []string {
	return rc.idsToCreate
}

// GetMachinesToUpdate returns all machine IDs which have outdated config patches
// or no ClusterMachineConfigPatches resource at all.
func (rc *ReconciliationContext) GetMachinesToUpdate() []string {
	return rc.idsToUpdate
}

// GetTearingDownMachines returns all ClusterMachines in TearingDown phase which are not yet ready to be destroyed.
func (rc *ReconciliationContext) GetTearingDownMachines() Set[string] {
	return rc.idsTearingDown
}

// GetUpdatingMachines returns all ClusterMachines which have the config update pending, or are not configured at all.
func (rc *ReconciliationContext) GetUpdatingMachines() Set[string] {
	return Union(
		rc.idsOutdated,
		rc.idsUnconfigured,
	)
}

// GetOutdatedMachines returns the machines which have the config status, but the config update is still pending for them.
func (rc *ReconciliationContext) GetOutdatedMachines() Set[string] {
	return rc.idsOutdated
}

// GetCluster reads the related cluster resource.
+func (rc *ReconciliationContext) GetCluster() *omni.Cluster { + return rc.cluster +} + +// GetMachineSet reads the related machine set resource. +func (rc *ReconciliationContext) GetMachineSet() *omni.MachineSet { + return rc.machineSet +} + +// GetMachineSetNodes reads the related machine set nodes resources. +func (rc *ReconciliationContext) GetMachineSetNodes() map[resource.ID]*omni.MachineSetNode { + return rc.machineSetNodesMap +} + +// GetClusterMachines reads the related machine set resources. +func (rc *ReconciliationContext) GetClusterMachines() map[resource.ID]*omni.ClusterMachine { + return rc.clusterMachinesMap +} + +// GetRunningClusterMachines gets all cluster machines except destroy ready ones. +func (rc *ReconciliationContext) GetRunningClusterMachines() map[resource.ID]*omni.ClusterMachine { + machines := make(map[resource.ID]*omni.ClusterMachine, len(rc.clusterMachinesMap)-len(rc.idsToDestroy)) + + for id, cm := range rc.clusterMachinesMap { + if rc.idsDestroyReady.Contains(id) { + continue + } + + machines[id] = cm + } + + return machines +} + +// GetRunningMachineSetNodes gets all machine set nodes in running phase. +func (rc *ReconciliationContext) GetRunningMachineSetNodes() map[resource.ID]*omni.MachineSetNode { + machines := make(map[resource.ID]*omni.MachineSetNode, len(rc.runningMachineSetNodesSet)) + + for id, cm := range rc.machineSetNodesMap { + if !rc.runningMachineSetNodesSet.Contains(id) { + continue + } + + machines[id] = cm + } + + return machines +} + +// GetClusterMachineStatuses reads the related machine set resources. +func (rc *ReconciliationContext) GetClusterMachineStatuses() map[resource.ID]*omni.ClusterMachineStatus { + return rc.clusterMachineStatusesMap +} + +// GetClusterMachineConfigStatuses reads the related machine set resources. 
+func (rc *ReconciliationContext) GetClusterMachineConfigStatuses() map[resource.ID]*omni.ClusterMachineConfigStatus { + return rc.clusterMachineConfigStatusesMap +} + +// GetClusterMachine by the id. +func (rc *ReconciliationContext) GetClusterMachine(id resource.ID) (*omni.ClusterMachine, bool) { + cm, ok := rc.clusterMachinesMap[id] + + return cm, ok +} + +// GetClusterMachineConfigStatus by the id. +func (rc *ReconciliationContext) GetClusterMachineConfigStatus(id resource.ID) (*omni.ClusterMachineConfigStatus, bool) { + cm, ok := rc.clusterMachineConfigStatusesMap[id] + + return cm, ok +} + +// GetConfigPatches reads previosly collected confignpatches for a machine. +func (rc *ReconciliationContext) GetConfigPatches(id resource.ID) []*omni.ConfigPatch { + return rc.patchesByMachine[id] +} + +// LBHealthy returns the health status of the loadbalancer for the current cluster. +func (rc *ReconciliationContext) LBHealthy() bool { + return rc.lbHealthy +} + +// CalculateQuota computes limits for scale down and update basing on the machine set max update parallelism and machine set role. 
+func (rc *ReconciliationContext) CalculateQuota() ChangeQuota { + var ( + quota ChangeQuota + machineSetSpec = rc.machineSet.TypedSpec().Value + ) + + quota.Teardown = getParallelismOrDefault(machineSetSpec.DeleteStrategy, machineSetSpec.DeleteStrategyConfig, -1) + quota.Update = getParallelismOrDefault(machineSetSpec.UpdateStrategy, machineSetSpec.UpdateStrategyConfig, 1) + + // final delete quota is MaxParallelism minus machines in tearing down phase + if quota.Teardown > 0 { + quota.Teardown -= len(rc.idsTearingDown) + + if quota.Teardown < 0 { + quota.Teardown = 0 + } + } + + // final update quota is MaxParallelism minus currently updated machines count + if quota.Update > 0 { + quota.Update -= len(rc.GetUpdatingMachines()) + + if quota.Update < 0 { + quota.Update = 0 + } + } + + return quota +} + +func getParallelismOrDefault(strategyType specs.MachineSetSpec_UpdateStrategy, strategy *specs.MachineSetSpec_UpdateStrategyConfig, def int) int { + if strategyType == specs.MachineSetSpec_Rolling { + if strategy == nil { + return def + } + + return int(strategy.Rolling.MaxParallelism) + } + + return def +} + +func isUpdating(clusterMachine *omni.ClusterMachine, clusterMachineConfigStatus *omni.ClusterMachineConfigStatus) bool { + return clusterMachineConfigStatus.TypedSpec().Value.ClusterMachineVersion != clusterMachine.Metadata().Version().String() || clusterMachineConfigStatus.TypedSpec().Value.LastConfigError != "" +} + +func toSet[T resource.Resource](resources []T) Set[resource.ID] { + return Set[resource.ID](xslices.ToSetFunc(resources, func(r T) resource.ID { + return r.Metadata().ID() + })) +} + +func toMap[T resource.Resource](resources []T) map[resource.ID]T { + return xslices.ToMap(resources, func(r T) (resource.ID, T) { + return r.Metadata().ID(), r + }) +} diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/reconciliation_context_test.go 
b/internal/backend/runtime/omni/controllers/omni/internal/machineset/reconciliation_context_test.go new file mode 100644 index 000000000..4fbd49fc3 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/reconciliation_context_test.go @@ -0,0 +1,364 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. + +package machineset_test + +import ( + "testing" + + "github.com/cosi-project/runtime/pkg/resource" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/siderolabs/omni/client/api/omni/specs" + "github.com/siderolabs/omni/client/pkg/omni/resources" + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/machineset" +) + +type fakePatchHelper struct { + patches map[string][]*omni.ConfigPatch +} + +func (fph *fakePatchHelper) Get(cm *omni.ClusterMachine, _ *omni.MachineSet) ([]*omni.ConfigPatch, error) { + if fph.patches == nil { + return nil, nil + } + + return fph.patches[cm.Metadata().ID()], nil +} + +//nolint:maintidx +func TestReconciliationContext(t *testing.T) { + t.Parallel() + + tearingDownMachine := omni.NewClusterMachine(resources.DefaultNamespace, "a") + tearingDownMachine.Metadata().SetPhase(resource.PhaseTearingDown) + + updatedMachine := omni.NewClusterMachine(resources.DefaultNamespace, "a") + updatedMachine.Metadata().SetVersion(resource.VersionUndefined.Next().Next()) + + lockedMachine := omni.NewMachineSetNode(resources.DefaultNamespace, "b", omni.NewMachineSet("", "")) + lockedMachine.Metadata().Annotations().Set(omni.MachineLocked, "") + + synced := omni.NewClusterMachine(resources.DefaultNamespace, "a") + helpers.UpdateInputsAnnotation(synced) + + var configPatches []*omni.ConfigPatch + + version := 
resource.VersionUndefined.Next() + + //nolint:govet + for _, tt := range []struct { + name string + machineSet *specs.MachineSetSpec + lbUnhealthy bool + machineSetNodes []*omni.MachineSetNode + clusterMachines []*omni.ClusterMachine + clusterMachineConfigStatuses []*omni.ClusterMachineConfigStatus + clusterMachineConfigPatches []*omni.ClusterMachineConfigPatches + expectedQuota machineset.ChangeQuota + expectedTearingDown []string + expectedUpdating []string + + expectedToUpdate []string + expectedToCreate []string + expectedToTeardown []string + expectedToDestroy []string + }{ + { + name: "rolling no machines", + machineSet: &specs.MachineSetSpec{ + UpdateStrategy: specs.MachineSetSpec_Rolling, + DeleteStrategy: specs.MachineSetSpec_Rolling, + DeleteStrategyConfig: &specs.MachineSetSpec_UpdateStrategyConfig{ + Rolling: &specs.MachineSetSpec_RollingUpdateStrategyConfig{ + MaxParallelism: 1, + }, + }, + }, + expectedQuota: machineset.ChangeQuota{ + Teardown: 1, + Update: 1, + }, + }, + { + name: "running machines", + machineSet: &specs.MachineSetSpec{ + UpdateStrategy: specs.MachineSetSpec_Rolling, + DeleteStrategy: specs.MachineSetSpec_Unset, + }, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", omni.NewMachineSet("", "")), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions(withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "a"), version), configPatches...), + }, + clusterMachineConfigStatuses: []*omni.ClusterMachineConfigStatus{ + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "a"), version), + }, + clusterMachineConfigPatches: []*omni.ClusterMachineConfigPatches{ + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "a"), + }, + expectedQuota: machineset.ChangeQuota{ + Teardown: -1, + Update: 1, + }, + }, + { + name: "running machines 1 to update", + machineSet: &specs.MachineSetSpec{ + UpdateStrategy: 
specs.MachineSetSpec_Rolling, + DeleteStrategy: specs.MachineSetSpec_Unset, + }, + clusterMachines: []*omni.ClusterMachine{ + withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "a"), version), + }, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", omni.NewMachineSet("", "")), + }, + clusterMachineConfigStatuses: []*omni.ClusterMachineConfigStatus{ + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "a"), version), + }, + clusterMachineConfigPatches: []*omni.ClusterMachineConfigPatches{ + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "a"), + }, + expectedQuota: machineset.ChangeQuota{ + Teardown: -1, + Update: 1, + }, + expectedToUpdate: []string{"a"}, + }, + { + name: "destroy machines", + machineSet: &specs.MachineSetSpec{ + UpdateStrategy: specs.MachineSetSpec_Rolling, + DeleteStrategy: specs.MachineSetSpec_Unset, + }, + machineSetNodes: []*omni.MachineSetNode{ + tearingDown(omni.NewMachineSetNode(resources.DefaultNamespace, "a", newMachineSet(1))), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions(withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "a"), version), configPatches...), + withUpdateInputVersions(withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "b"), version), configPatches...), + }, + clusterMachineConfigStatuses: []*omni.ClusterMachineConfigStatus{ + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "a"), version), + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "b"), version), + }, + clusterMachineConfigPatches: []*omni.ClusterMachineConfigPatches{ + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "a"), + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "b"), + }, + expectedQuota: machineset.ChangeQuota{ + Teardown: -1, + Update: 1, + }, + 
expectedToTeardown: []string{"a", "b"}, + }, + { + name: "update locked noop", + machineSet: &specs.MachineSetSpec{ + UpdateStrategy: specs.MachineSetSpec_Rolling, + DeleteStrategy: specs.MachineSetSpec_Rolling, + }, + machineSetNodes: []*omni.MachineSetNode{ + lockedMachine, + }, + clusterMachines: []*omni.ClusterMachine{ + withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "b"), version), + }, + clusterMachineConfigStatuses: []*omni.ClusterMachineConfigStatus{ + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "b"), version), + }, + expectedQuota: machineset.ChangeQuota{ + Teardown: -1, + Update: 1, + }, + }, + { + name: "update locked quota", + machineSet: &specs.MachineSetSpec{ + UpdateStrategy: specs.MachineSetSpec_Rolling, + DeleteStrategy: specs.MachineSetSpec_Rolling, + }, + machineSetNodes: []*omni.MachineSetNode{ + lockedMachine, + omni.NewMachineSetNode(resources.DefaultNamespace, "c", omni.NewMachineSet("", "")), + }, + clusterMachines: []*omni.ClusterMachine{ + withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "b"), version), + withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "c"), version), + }, + clusterMachineConfigStatuses: []*omni.ClusterMachineConfigStatus{ + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "b"), version), + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "c"), version), + }, + clusterMachineConfigPatches: []*omni.ClusterMachineConfigPatches{ + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "b"), + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "c"), + }, + expectedQuota: machineset.ChangeQuota{ + Teardown: -1, + Update: 1, + }, + expectedToUpdate: []string{"c"}, + }, + { + name: "tearing down machines", + machineSet: &specs.MachineSetSpec{ + UpdateStrategy: specs.MachineSetSpec_Rolling, + DeleteStrategy: 
specs.MachineSetSpec_Rolling, + DeleteStrategyConfig: &specs.MachineSetSpec_UpdateStrategyConfig{ + Rolling: &specs.MachineSetSpec_RollingUpdateStrategyConfig{ + MaxParallelism: 1, + }, + }, + }, + clusterMachines: []*omni.ClusterMachine{ + tearingDown(withUpdateInputVersions(withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "a"), version), configPatches...)), + withUpdateInputVersions(withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "b"), version), configPatches...), + }, + clusterMachineConfigStatuses: []*omni.ClusterMachineConfigStatus{ + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "a"), version), + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "b"), version), + }, + clusterMachineConfigPatches: []*omni.ClusterMachineConfigPatches{ + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "a"), + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "b"), + }, + expectedQuota: machineset.ChangeQuota{ + Teardown: 0, + Update: 1, + }, + expectedToTeardown: []string{"b"}, + expectedTearingDown: []string{"a"}, + }, + { + name: "1 updating", + machineSet: &specs.MachineSetSpec{ + UpdateStrategy: specs.MachineSetSpec_Rolling, + DeleteStrategy: specs.MachineSetSpec_Unset, + }, + clusterMachines: []*omni.ClusterMachine{ + omni.NewClusterMachine(resources.DefaultNamespace, "a"), + }, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", omni.NewMachineSet("", "")), + }, + clusterMachineConfigStatuses: []*omni.ClusterMachineConfigStatus{ + omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "a"), + }, + expectedQuota: machineset.ChangeQuota{ + Teardown: -1, + Update: 0, + }, + expectedToUpdate: []string{"a"}, + expectedUpdating: []string{"a"}, + }, + { + name: "workers tearing down rolling 3 in parallel", + machineSet: &specs.MachineSetSpec{ + UpdateStrategy: 
specs.MachineSetSpec_Rolling, + UpdateStrategyConfig: &specs.MachineSetSpec_UpdateStrategyConfig{ + Rolling: &specs.MachineSetSpec_RollingUpdateStrategyConfig{ + MaxParallelism: 3, + }, + }, + DeleteStrategy: specs.MachineSetSpec_Unset, + }, + clusterMachines: []*omni.ClusterMachine{ + tearingDown(withUpdateInputVersions(withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "a"), version), configPatches...)), + }, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", omni.NewMachineSet("", "")), + }, + clusterMachineConfigStatuses: []*omni.ClusterMachineConfigStatus{ + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "a"), version), + }, + clusterMachineConfigPatches: []*omni.ClusterMachineConfigPatches{ + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "a"), + }, + expectedQuota: machineset.ChangeQuota{ + Teardown: -1, + Update: 3, + }, + expectedTearingDown: []string{"a"}, + }, + { + name: "destroy without finalizers", + machineSet: &specs.MachineSetSpec{ + UpdateStrategy: specs.MachineSetSpec_Rolling, + DeleteStrategy: specs.MachineSetSpec_Unset, + }, + clusterMachines: []*omni.ClusterMachine{ + tearingDownNoFinalizers(omni.NewClusterMachine(resources.DefaultNamespace, "a")), + }, + expectedQuota: machineset.ChangeQuota{ + Teardown: -1, + Update: 1, + }, + expectedToDestroy: []string{"a"}, + }, + } { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + require := require.New(t) + assert := assert.New(t) + + machineSet := omni.NewMachineSet(resources.DefaultNamespace, tt.name) + machineSet.TypedSpec().Value = tt.machineSet + + cluster := omni.NewCluster(resources.DefaultNamespace, tt.name) + cluster.TypedSpec().Value.TalosVersion = "v1.6.4" + cluster.TypedSpec().Value.KubernetesVersion = "v1.29.0" + + var loadbalancerStatus *omni.LoadBalancerStatus + + if !tt.lbUnhealthy { + loadbalancerStatus = 
omni.NewLoadBalancerStatus(resources.DefaultNamespace, tt.name) + loadbalancerStatus.TypedSpec().Value.Healthy = true + } + + rc, err := machineset.NewReconciliationContext( + cluster, + machineSet, + loadbalancerStatus, + &fakePatchHelper{}, + tt.machineSetNodes, + tt.clusterMachines, + tt.clusterMachineConfigStatuses, + tt.clusterMachineConfigPatches, + nil, + ) + + require.NoError(err) + + q := rc.CalculateQuota() + + assert.EqualValues(tt.expectedQuota, q) + + assert.EqualValues(tt.expectedToCreate, rc.GetMachinesToCreate(), "machines to create do not match") + assert.EqualValues(tt.expectedToTeardown, rc.GetMachinesToTeardown(), "machines to destroy do not match") + assert.EqualValues(tt.expectedToUpdate, rc.GetMachinesToUpdate(), "machines to update do not match") + + updating := rc.GetUpdatingMachines() + assert.EqualValues(len(tt.expectedUpdating), len(updating), "updating machines do not match") + + for _, id := range tt.expectedUpdating { + assert.True(updating.Contains(id)) + } + + tearingDown := rc.GetTearingDownMachines() + assert.EqualValues(len(tt.expectedTearingDown), len(tearingDown), "tearing down machines do not match") + + for _, id := range tt.expectedTearingDown { + assert.True(tearingDown.Contains(id)) + } + }) + } +} diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/set.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/set.go new file mode 100644 index 000000000..038d135f8 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/set.go @@ -0,0 +1,88 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+ +package machineset + +import ( + "cmp" + "slices" + + "github.com/siderolabs/gen/maps" +) + +// Set implements a generic set used in the machine set reconciliation context +// which relies on getting diff and intersections of machine set nodes, cluster machines, +// machines running in different state to produce lists of machines for update, create and destroy. +type Set[K cmp.Ordered] map[K]struct{} + +// Contains checks if the set has value. +func (s Set[K]) Contains(value K) bool { + _, ok := s[value] + + return ok +} + +// Add adds a value to the set. +func (s Set[K]) Add(value K) { + s[value] = struct{}{} +} + +// Difference calculates difference between a set and number of other sets. +func Difference[K cmp.Ordered](a Set[K], other ...Set[K]) Set[K] { + res := make(Set[K], len(a)) + +outer: + for k := range a { + for _, b := range other { + if _, ok := b[k]; ok { + continue outer + } + } + + res[k] = struct{}{} + } + + return res +} + +// Intersection calculates intersections of a set with all other sets. +func Intersection[K cmp.Ordered](a Set[K], other ...Set[K]) Set[K] { + res := make(Set[K], len(a)) + +outer: + for k := range a { + for _, b := range other { + if _, ok := b[k]; !ok { + continue outer + } + } + + res[k] = struct{}{} + } + + return res +} + +// Union calculates union of a set with all other sets. +func Union[K cmp.Ordered](sets ...Set[K]) Set[K] { + res := Set[K]{} + + for _, set := range sets { + for k := range set { + res[k] = struct{}{} + } + } + + return res +} + +// Values converts set to a sorted slice. 
+func Values[K cmp.Ordered](s Set[K]) []K { + keys := maps.Keys(s) + + slices.Sort(keys) + + return keys +} diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/status_handler.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/status_handler.go new file mode 100644 index 000000000..3b96e2399 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/status_handler.go @@ -0,0 +1,141 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. + +package machineset + +import ( + "crypto/sha256" + "encoding/hex" + "slices" + + "github.com/siderolabs/gen/maps" + + "github.com/siderolabs/omni/client/api/omni/specs" + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" +) + +// ReconcileStatus builds the machine set status from the resources. +// +//nolint:gocyclo,cyclop +func ReconcileStatus(rc *ReconciliationContext, machineSetStatus *omni.MachineSetStatus) { + spec := machineSetStatus.TypedSpec().Value + + configHashHasher := sha256.New() + + clusterMachineConfigStatuses := rc.GetClusterMachineConfigStatuses() + ids := maps.Keys(clusterMachineConfigStatuses) + + slices.Sort(ids) + + for _, id := range ids { + configStatus := clusterMachineConfigStatuses[id] + + configHashHasher.Write([]byte(configStatus.TypedSpec().Value.ClusterMachineConfigSha256)) + } + + // combined hash of all cluster machine config hashes + spec.ConfigHash = hex.EncodeToString(configHashHasher.Sum(nil)) + + machineSet := rc.GetMachineSet() + machineSetNodes := rc.GetRunningMachineSetNodes() + clusterMachines := rc.GetRunningClusterMachines() + clusterMachineStatuses := rc.GetClusterMachineStatuses() + + helpers.CopyAllLabels(machineSet, machineSetStatus) + + spec.Phase = specs.MachineSetPhase_Running + spec.Error = "" + spec.Machines = 
&specs.Machines{} + + spec.Machines.Requested = uint32(len(machineSetNodes)) + + // requested machines is max(manuallyAllocatedMachines, machineClassMachineCount) + // if machine class allocation type is not static it falls back to the actual machineSetNodes count + // then we first compare number of machine set nodes against the number of requested machines + // if they match we compare the number of cluster machines against the number of machine set nodes + machineClass := machineSet.TypedSpec().Value.MachineClass + if machineClass != nil && machineClass.AllocationType == specs.MachineSetSpec_MachineClass_Static { + spec.Machines.Requested = machineClass.MachineCount + } + + spec.MachineClass = machineClass + + switch { + case len(machineSetNodes) < int(spec.Machines.Requested): + spec.Phase = specs.MachineSetPhase_ScalingUp + case len(machineSetNodes) > int(spec.Machines.Requested): + spec.Phase = specs.MachineSetPhase_ScalingDown + case len(clusterMachineStatuses) < len(machineSetNodes): + spec.Phase = specs.MachineSetPhase_ScalingUp + case len(clusterMachines) > len(machineSetNodes): + spec.Phase = specs.MachineSetPhase_ScalingDown + } + + _, isControlPlane := machineSet.Metadata().Labels().Get(omni.LabelControlPlaneRole) + + if isControlPlane && len(machineSetNodes) == 0 { + spec.Phase = specs.MachineSetPhase_Failed + spec.Error = "control plane machine set must have at least one node" + } + + for _, clusterMachine := range clusterMachines { + spec.Machines.Total++ + + if clusterMachineStatus := clusterMachineStatuses[clusterMachine.Metadata().ID()]; clusterMachineStatus != nil { + if clusterMachineStatus.TypedSpec().Value.Stage == specs.ClusterMachineStatusSpec_RUNNING && clusterMachineStatus.TypedSpec().Value.Ready { + spec.Machines.Healthy++ + } + + if _, ok := clusterMachineStatus.Metadata().Labels().Get(omni.MachineStatusLabelConnected); ok { + spec.Machines.Connected++ + } + } + } + + spec.Ready = spec.Phase == specs.MachineSetPhase_Running + + if 
!spec.Ready { + return + } + + if len(rc.GetMachinesToUpdate()) > 0 || len(rc.GetOutdatedMachines()) > 0 { + machineSetStatus.TypedSpec().Value.Phase = specs.MachineSetPhase_Reconfiguring + + spec.Ready = false + + return + } + + for _, machineSetNode := range machineSetNodes { + clusterMachine := clusterMachines[machineSetNode.Metadata().ID()] + clusterMachineStatus := clusterMachineStatuses[machineSetNode.Metadata().ID()] + + if clusterMachine == nil || clusterMachineStatus == nil { + spec.Ready = false + spec.Phase = specs.MachineSetPhase_ScalingUp + + return + } + + clusterMachineStatusSpec := clusterMachineStatus.TypedSpec().Value + + if clusterMachineStatusSpec.Stage != specs.ClusterMachineStatusSpec_RUNNING { + spec.Ready = false + + return + } + + if !clusterMachineStatusSpec.Ready { + spec.Ready = false + + return + } + } + + if spec.Machines.Connected != spec.Machines.Total { + spec.Ready = false + } +} diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/status_handler_test.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/status_handler_test.go new file mode 100644 index 000000000..5f07820e7 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/status_handler_test.go @@ -0,0 +1,388 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+ +package machineset_test + +import ( + "reflect" + "testing" + + "github.com/cosi-project/runtime/pkg/resource" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/siderolabs/gen/xslices" + "github.com/stretchr/testify/require" + + "github.com/siderolabs/omni/client/api/omni/specs" + "github.com/siderolabs/omni/client/pkg/omni/resources" + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/machineset" +) + +func newClusterMachineStatus(id string, stage specs.ClusterMachineStatusSpec_Stage, ready, connected bool) *omni.ClusterMachineStatus { + res := omni.NewClusterMachineStatus(resources.DefaultNamespace, id) + res.TypedSpec().Value.Ready = ready + res.TypedSpec().Value.Stage = stage + + if connected { + res.Metadata().Labels().Set(omni.MachineStatusLabelConnected, "") + } + + return res +} + +func newMachineSet(machineCount int) *omni.MachineSet { + res := omni.NewMachineSet(resources.DefaultNamespace, "test") + res.TypedSpec().Value.MachineClass = &specs.MachineSetSpec_MachineClass{ + MachineCount: uint32(machineCount), + } + + return res +} + +//nolint:maintidx +func TestStatusHandler(t *testing.T) { + ms := omni.NewMachineSet("", "") + + var patches []*omni.ConfigPatch + + //nolint:govet + for _, tt := range []struct { + name string + machineSet *omni.MachineSet + machineSetNodes []*omni.MachineSetNode + clusterMachines []*omni.ClusterMachine + clusterMachineStatuses []*omni.ClusterMachineStatus + expectedStatus *specs.MachineSetStatusSpec + }{ + { + name: "running no machines", + expectedStatus: &specs.MachineSetStatusSpec{ + Phase: specs.MachineSetPhase_Running, + Ready: true, + Machines: &specs.Machines{}, + ConfigHash: "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + }, + }, + { + name: "running 2 machines", + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, 
"a", ms), + omni.NewMachineSetNode(resources.DefaultNamespace, "b", ms), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, "a"), patches...), + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, "b"), patches...), + }, + clusterMachineStatuses: []*omni.ClusterMachineStatus{ + newClusterMachineStatus("a", specs.ClusterMachineStatusSpec_RUNNING, true, true), + newClusterMachineStatus("b", specs.ClusterMachineStatusSpec_RUNNING, true, true), + }, + expectedStatus: &specs.MachineSetStatusSpec{ + Phase: specs.MachineSetPhase_Running, + Ready: true, + Machines: &specs.Machines{ + Total: 2, + Healthy: 2, + Connected: 2, + Requested: 2, + }, + ConfigHash: "fb8e20fc2e4c3f248c60c39bd652f3c1347298bb977b8b4d5903b85055620603", + }, + }, + { + name: "pending update 2 machines", + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", ms), + omni.NewMachineSetNode(resources.DefaultNamespace, "b", ms), + }, + clusterMachines: []*omni.ClusterMachine{ + omni.NewClusterMachine(resources.DefaultNamespace, "a"), + omni.NewClusterMachine(resources.DefaultNamespace, "b"), + }, + clusterMachineStatuses: []*omni.ClusterMachineStatus{ + newClusterMachineStatus("a", specs.ClusterMachineStatusSpec_RUNNING, true, true), + newClusterMachineStatus("b", specs.ClusterMachineStatusSpec_RUNNING, true, true), + }, + expectedStatus: &specs.MachineSetStatusSpec{ + Phase: specs.MachineSetPhase_Reconfiguring, + Ready: false, + Machines: &specs.Machines{ + Total: 2, + Healthy: 2, + Connected: 2, + Requested: 2, + }, + ConfigHash: "fb8e20fc2e4c3f248c60c39bd652f3c1347298bb977b8b4d5903b85055620603", + }, + }, + { + name: "scaling down", + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", ms), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, 
"a"), patches...), + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, "b"), patches...), + }, + clusterMachineStatuses: []*omni.ClusterMachineStatus{ + newClusterMachineStatus("a", specs.ClusterMachineStatusSpec_RUNNING, true, true), + newClusterMachineStatus("b", specs.ClusterMachineStatusSpec_RUNNING, true, true), + }, + expectedStatus: &specs.MachineSetStatusSpec{ + Phase: specs.MachineSetPhase_ScalingDown, + Ready: false, + Machines: &specs.Machines{ + Total: 2, + Healthy: 2, + Connected: 2, + Requested: 1, + }, + ConfigHash: "fb8e20fc2e4c3f248c60c39bd652f3c1347298bb977b8b4d5903b85055620603", + }, + }, + { + name: "scaling up", + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", ms), + omni.NewMachineSetNode(resources.DefaultNamespace, "b", ms), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, "a"), patches...), + }, + clusterMachineStatuses: []*omni.ClusterMachineStatus{ + newClusterMachineStatus("a", specs.ClusterMachineStatusSpec_RUNNING, true, true), + }, + expectedStatus: &specs.MachineSetStatusSpec{ + Phase: specs.MachineSetPhase_ScalingUp, + Ready: false, + Machines: &specs.Machines{ + Total: 1, + Healthy: 1, + Connected: 1, + Requested: 2, + }, + ConfigHash: "ca978112ca1bbdcafac231b39a23dc4da786eff8147c4e72b9807785afee48bb", + }, + }, + { + name: "running 2 machines, not ready", + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", ms), + omni.NewMachineSetNode(resources.DefaultNamespace, "b", ms), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, "a"), patches...), + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, "b"), patches...), + }, + clusterMachineStatuses: []*omni.ClusterMachineStatus{ + newClusterMachineStatus("a", 
specs.ClusterMachineStatusSpec_RUNNING, false, true), + newClusterMachineStatus("b", specs.ClusterMachineStatusSpec_RUNNING, true, true), + }, + expectedStatus: &specs.MachineSetStatusSpec{ + Phase: specs.MachineSetPhase_Running, + Ready: false, + Machines: &specs.Machines{ + Total: 2, + Healthy: 1, + Connected: 2, + Requested: 2, + }, + ConfigHash: "fb8e20fc2e4c3f248c60c39bd652f3c1347298bb977b8b4d5903b85055620603", + }, + }, + { + name: "running 2 machines, not connected", + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", ms), + omni.NewMachineSetNode(resources.DefaultNamespace, "b", ms), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, "a"), patches...), + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, "b"), patches...), + }, + clusterMachineStatuses: []*omni.ClusterMachineStatus{ + newClusterMachineStatus("a", specs.ClusterMachineStatusSpec_RUNNING, true, true), + newClusterMachineStatus("b", specs.ClusterMachineStatusSpec_RUNNING, true, false), + }, + expectedStatus: &specs.MachineSetStatusSpec{ + Phase: specs.MachineSetPhase_Running, + Ready: false, + Machines: &specs.Machines{ + Total: 2, + Healthy: 2, + Connected: 1, + Requested: 2, + }, + ConfigHash: "fb8e20fc2e4c3f248c60c39bd652f3c1347298bb977b8b4d5903b85055620603", + }, + }, + { + name: "scaling down and scaling up", + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "b", ms), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, "a"), patches...), + }, + clusterMachineStatuses: []*omni.ClusterMachineStatus{ + newClusterMachineStatus("a", specs.ClusterMachineStatusSpec_RUNNING, true, true), + }, + expectedStatus: &specs.MachineSetStatusSpec{ + Phase: specs.MachineSetPhase_ScalingUp, + Ready: false, + Machines: &specs.Machines{ + Total: 1, 
+ Healthy: 1, + Connected: 1, + Requested: 1, + }, + ConfigHash: "ca978112ca1bbdcafac231b39a23dc4da786eff8147c4e72b9807785afee48bb", + }, + }, + { + name: "scaling up machine class", + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", ms), + }, + machineSet: newMachineSet(4), + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, "a"), patches...), + }, + clusterMachineStatuses: []*omni.ClusterMachineStatus{ + newClusterMachineStatus("a", specs.ClusterMachineStatusSpec_RUNNING, true, true), + }, + expectedStatus: &specs.MachineSetStatusSpec{ + Phase: specs.MachineSetPhase_ScalingUp, + Ready: false, + Machines: &specs.Machines{ + Total: 1, + Healthy: 1, + Connected: 1, + Requested: 4, + }, + MachineClass: &specs.MachineSetSpec_MachineClass{ + MachineCount: 4, + }, + ConfigHash: "ca978112ca1bbdcafac231b39a23dc4da786eff8147c4e72b9807785afee48bb", + }, + }, + { + name: "scaling down machine class", + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", ms), + }, + machineSet: newMachineSet(0), + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, "a"), patches...), + }, + clusterMachineStatuses: []*omni.ClusterMachineStatus{ + newClusterMachineStatus("a", specs.ClusterMachineStatusSpec_RUNNING, true, true), + }, + expectedStatus: &specs.MachineSetStatusSpec{ + Phase: specs.MachineSetPhase_ScalingDown, + Ready: false, + Machines: &specs.Machines{ + Total: 1, + Healthy: 1, + Connected: 1, + Requested: 0, + }, + MachineClass: &specs.MachineSetSpec_MachineClass{ + MachineCount: 0, + }, + ConfigHash: "ca978112ca1bbdcafac231b39a23dc4da786eff8147c4e72b9807785afee48bb", + }, + }, + { + name: "unready 2 machines", + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", ms), + 
omni.NewMachineSetNode(resources.DefaultNamespace, "b", ms), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, "a"), patches...), + withUpdateInputVersions(omni.NewClusterMachine(resources.DefaultNamespace, "b"), patches...), + }, + clusterMachineStatuses: []*omni.ClusterMachineStatus{ + newClusterMachineStatus("a", specs.ClusterMachineStatusSpec_BOOTING, true, true), + newClusterMachineStatus("b", specs.ClusterMachineStatusSpec_RUNNING, true, true), + }, + expectedStatus: &specs.MachineSetStatusSpec{ + Phase: specs.MachineSetPhase_Running, + Ready: false, + Machines: &specs.Machines{ + Total: 2, + Healthy: 1, + Connected: 2, + Requested: 2, + }, + ConfigHash: "fb8e20fc2e4c3f248c60c39bd652f3c1347298bb977b8b4d5903b85055620603", + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + machineSet := tt.machineSet + if machineSet == nil { + machineSet = omni.NewMachineSet(resources.DefaultNamespace, "test") + } + + require := require.New(t) + + clusterMachineConfigStatuses := make([]*omni.ClusterMachineConfigStatus, 0, len(tt.clusterMachines)) + clusterMachineConfigPatches := make([]*omni.ClusterMachineConfigPatches, 0, len(tt.clusterMachines)) + + for _, cm := range tt.clusterMachines { + version := resource.VersionUndefined.Next() + + cm.Metadata().SetVersion(version) + + clusterMachineConfigStatuses = append(clusterMachineConfigStatuses, withSpecSetter( + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, cm.Metadata().ID()), version), + func(r *omni.ClusterMachineConfigStatus) { + r.TypedSpec().Value.ClusterMachineConfigSha256 = cm.Metadata().ID() + }, + )) + + clusterMachineConfigPatches = append(clusterMachineConfigPatches, omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, cm.Metadata().ID())) + } + + rc, err := machineset.NewReconciliationContext( + omni.NewCluster(resources.DefaultNamespace, "test"), + machineSet, + 
newHealthyLB("test"), + &fakePatchHelper{}, + tt.machineSetNodes, + tt.clusterMachines, + clusterMachineConfigStatuses, + clusterMachineConfigPatches, + tt.clusterMachineStatuses, + ) + + require.NoError(err) + + machineSetStatus := omni.NewMachineSetStatus(resources.DefaultNamespace, "doesn't matter") + + machineset.ReconcileStatus(rc, machineSetStatus) + + require.True(tt.expectedStatus.EqualVT(machineSetStatus.TypedSpec().Value), "machine set status doesn't match %s", cmp.Diff( + tt.expectedStatus, + machineSetStatus.TypedSpec().Value, + IgnoreUnexported(tt.expectedStatus, &specs.Machines{}, &specs.MachineSetSpec_MachineClass{}), + )) + }) + } +} + +func IgnoreUnexported(vals ...any) cmp.Option { + return cmpopts.IgnoreUnexported(xslices.Map(vals, func(v any) any { + val := reflect.ValueOf(v) + if val.Kind() == reflect.Ptr { + val = val.Elem() + } + + return val.Interface() + })...) +} diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/tearing_down_handler.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/tearing_down_handler.go new file mode 100644 index 000000000..abe2221c6 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/tearing_down_handler.go @@ -0,0 +1,21 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. + +package machineset + +// ReconcileTearingDown removes all machines from the machine set without any checks. 
+func ReconcileTearingDown(rc *ReconciliationContext) []Operation { + operations := make([]Operation, 0, len(rc.GetClusterMachines())) + + for _, id := range rc.GetMachinesToDestroy() { + operations = append(operations, &Destroy{ID: id}) + } + + for _, id := range rc.GetMachinesToTeardown() { + operations = append(operations, &Teardown{ID: id}) + } + + return operations +} diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/workers_handler.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/workers_handler.go new file mode 100644 index 000000000..efa3f1587 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/workers_handler.go @@ -0,0 +1,40 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. + +package machineset + +// ReconcileWorkers gets the reconciliation context and produces the list of changes to apply on the machine set. 
+func ReconcileWorkers(rc *ReconciliationContext) []Operation { + quota := rc.CalculateQuota() + + toCreate := rc.GetMachinesToCreate() + toTeardown := rc.GetMachinesToTeardown() + toUpdate := rc.GetMachinesToUpdate() + toDestroy := rc.GetMachinesToDestroy() + + operations := make([]Operation, 0, len(toCreate)+len(toTeardown)+len(toUpdate)+len(toDestroy)) + + for _, id := range toDestroy { + operations = append(operations, &Destroy{ID: id}) + } + + for _, id := range toCreate { + operations = append(operations, &Create{ID: id}) + } + + if !rc.LBHealthy() { + return operations + } + + for _, id := range toTeardown { + operations = append(operations, &Teardown{ID: id, Quota: &quota}) + } + + for _, id := range toUpdate { + operations = append(operations, &Update{ID: id, Quota: &quota}) + } + + return operations +} diff --git a/internal/backend/runtime/omni/controllers/omni/internal/machineset/workers_handler_test.go b/internal/backend/runtime/omni/controllers/omni/internal/machineset/workers_handler_test.go new file mode 100644 index 000000000..1e8a00e40 --- /dev/null +++ b/internal/backend/runtime/omni/controllers/omni/internal/machineset/workers_handler_test.go @@ -0,0 +1,224 @@ +// Copyright (c) 2024 Sidero Labs, Inc. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+ +package machineset_test + +import ( + "testing" + + "github.com/cosi-project/runtime/pkg/resource" + "github.com/stretchr/testify/require" + + "github.com/siderolabs/omni/client/api/omni/specs" + "github.com/siderolabs/omni/client/pkg/omni/resources" + "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/machineset" +) + +func TestWorkersHandler(t *testing.T) { + machineSet := omni.NewMachineSet(resources.DefaultNamespace, "test") + + version := resource.VersionUndefined.Next() + + //nolint:govet + for _, tt := range []struct { + name string + machineSet *specs.MachineSetSpec + machineSetNodes []*omni.MachineSetNode + clusterMachines []*omni.ClusterMachine + clusterMachineConfigStatuses []*omni.ClusterMachineConfigStatus + clusterMachineConfigPatches []*omni.ClusterMachineConfigPatches + + pendingConfigPatches map[resource.ID][]*omni.ConfigPatch + + expectRequeue bool + expectOperations []machineset.Operation + }{ + { + name: "create nodes", + machineSet: &specs.MachineSetSpec{}, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", machineSet), + omni.NewMachineSetNode(resources.DefaultNamespace, "b", machineSet), + omni.NewMachineSetNode(resources.DefaultNamespace, "c", machineSet), + }, + expectOperations: []machineset.Operation{ + &machineset.Create{ID: "a"}, + &machineset.Create{ID: "b"}, + &machineset.Create{ID: "c"}, + }, + }, + { + name: "create nodes when scaling down", + machineSet: &specs.MachineSetSpec{}, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", machineSet), + omni.NewMachineSetNode(resources.DefaultNamespace, "b", machineSet), + omni.NewMachineSetNode(resources.DefaultNamespace, "c", machineSet), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions[*omni.ClusterMachine, 
*omni.ConfigPatch](withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "a"), version)), + tearingDownNoFinalizers(omni.NewClusterMachine(resources.DefaultNamespace, "b")), + }, + clusterMachineConfigPatches: []*omni.ClusterMachineConfigPatches{ + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "a"), + }, + clusterMachineConfigStatuses: []*omni.ClusterMachineConfigStatus{ + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "a"), version), + }, + expectOperations: []machineset.Operation{ + &machineset.Destroy{ID: "b"}, + &machineset.Create{ID: "c"}, + }, + }, + { + name: "destroy multiple", + machineSet: &specs.MachineSetSpec{}, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", machineSet), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions[*omni.ClusterMachine, *omni.ConfigPatch](withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "a"), version)), + tearingDownNoFinalizers(omni.NewClusterMachine(resources.DefaultNamespace, "b")), + withUpdateInputVersions[*omni.ClusterMachine, *omni.ConfigPatch](withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "c"), version)), + withUpdateInputVersions[*omni.ClusterMachine, *omni.ConfigPatch](withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "d"), version)), + }, + clusterMachineConfigPatches: []*omni.ClusterMachineConfigPatches{ + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "a"), + }, + clusterMachineConfigStatuses: []*omni.ClusterMachineConfigStatus{ + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "a"), version), + }, + expectOperations: []machineset.Operation{ + &machineset.Destroy{ID: "b"}, + &machineset.Teardown{ID: "c"}, + &machineset.Teardown{ID: "d"}, + }, + }, + { + name: "destroy, create and update at the same time", + machineSet: &specs.MachineSetSpec{}, + machineSetNodes: 
[]*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", machineSet), + omni.NewMachineSetNode(resources.DefaultNamespace, "b", machineSet), + omni.NewMachineSetNode(resources.DefaultNamespace, "c", machineSet), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions[*omni.ClusterMachine, *omni.ConfigPatch](withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "a"), version)), + withUpdateInputVersions[*omni.ClusterMachine, *omni.ConfigPatch](withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "c"), version)), + withUpdateInputVersions[*omni.ClusterMachine, *omni.ConfigPatch](withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "d"), version)), + }, + clusterMachineConfigPatches: []*omni.ClusterMachineConfigPatches{ + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "a"), + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "c"), + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "d"), + }, + clusterMachineConfigStatuses: []*omni.ClusterMachineConfigStatus{ + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "a"), version), + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "d"), version), + }, + pendingConfigPatches: map[resource.ID][]*omni.ConfigPatch{ + "a": { + omni.NewConfigPatch(resources.DefaultNamespace, "1"), + }, + }, + expectOperations: []machineset.Operation{ + &machineset.Create{ + ID: "b", + }, + &machineset.Teardown{ + ID: "d", + }, + &machineset.Update{ + ID: "a", + }, + }, + }, + { + name: "update a machine", + machineSet: &specs.MachineSetSpec{}, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", machineSet), + }, + clusterMachines: []*omni.ClusterMachine{ + omni.NewClusterMachine(resources.DefaultNamespace, "a"), + }, + expectOperations: []machineset.Operation{ + &machineset.Update{ + ID: "a", 
+ }, + }, + }, + { + name: "no actions", + machineSet: &specs.MachineSetSpec{}, + machineSetNodes: []*omni.MachineSetNode{ + omni.NewMachineSetNode(resources.DefaultNamespace, "a", machineSet), + }, + clusterMachines: []*omni.ClusterMachine{ + withUpdateInputVersions[*omni.ClusterMachine, *omni.ConfigPatch](withVersion(omni.NewClusterMachine(resources.DefaultNamespace, "a"), version)), + }, + clusterMachineConfigPatches: []*omni.ClusterMachineConfigPatches{ + omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, "a"), + }, + clusterMachineConfigStatuses: []*omni.ClusterMachineConfigStatus{ + withClusterMachineVersionSetter(omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, "a"), version), + }, + expectOperations: []machineset.Operation{}, + }, + } { + t.Run(tt.name, func(t *testing.T) { + require := require.New(t) + + machineSet.TypedSpec().Value = tt.machineSet + + cluster := omni.NewCluster(resources.DefaultNamespace, tt.name) + cluster.TypedSpec().Value.TalosVersion = "v1.6.0" + cluster.TypedSpec().Value.KubernetesVersion = "v1.29.0" + + rc, err := machineset.NewReconciliationContext( + cluster, + machineSet, + newHealthyLB(cluster.Metadata().ID()), + &fakePatchHelper{ + tt.pendingConfigPatches, + }, + tt.machineSetNodes, + tt.clusterMachines, + tt.clusterMachineConfigStatuses, + tt.clusterMachineConfigPatches, + nil, + ) + + require.NoError(err) + + operations := machineset.ReconcileWorkers(rc) + + require.Equal(len(tt.expectOperations), len(operations), "%#v", operations) + + for i, op := range operations { + expected := tt.expectOperations[i] + + switch value := op.(type) { + case *machineset.Create: + create, ok := expected.(*machineset.Create) + + require.True(ok, "the operation at %d is not create", i) + require.Equal(create.ID, value.ID) + case *machineset.Update: + update, ok := expected.(*machineset.Update) + + require.True(ok, "the operation at %d is not update", i) + require.Equal(update.ID, value.ID) + case *machineset.Teardown: 
+ destroy, ok := expected.(*machineset.Teardown) + + require.True(ok, "the operation at %d is not destroy", i) + require.Equal(destroy.ID, value.ID) + } + } + }) + } +} diff --git a/internal/backend/runtime/omni/controllers/omni/internal/task/clustermachine/identity.go b/internal/backend/runtime/omni/controllers/omni/internal/task/clustermachine/identity.go index f372089f8..ec4bf56ab 100644 --- a/internal/backend/runtime/omni/controllers/omni/internal/task/clustermachine/identity.go +++ b/internal/backend/runtime/omni/controllers/omni/internal/task/clustermachine/identity.go @@ -43,6 +43,7 @@ import ( type IdentityCollectorTaskSpec struct { config *clientconfig.Config configVersion resource.Version + machineSetName string clusterName string managementAddress string id resource.ID @@ -53,7 +54,9 @@ type IdentityCollectorTaskSpec struct { type IdentityCollectorChan chan<- *omni.ClusterMachineIdentity // NewIdentityCollectorTaskSpec creates new ClusterMachineCollector. -func NewIdentityCollectorTaskSpec(id resource.ID, config *clientconfig.Config, configVersion resource.Version, address string, isControlPlane bool, clusterName string) IdentityCollectorTaskSpec { +func NewIdentityCollectorTaskSpec(id resource.ID, config *clientconfig.Config, configVersion resource.Version, address string, isControlPlane bool, + clusterName, machineSetName string, +) IdentityCollectorTaskSpec { return IdentityCollectorTaskSpec{ id: id, config: config, @@ -61,6 +64,7 @@ func NewIdentityCollectorTaskSpec(id resource.ID, config *clientconfig.Config, c managementAddress: address, isControlPlane: isControlPlane, clusterName: clusterName, + machineSetName: machineSetName, } } @@ -110,6 +114,10 @@ func (spec IdentityCollectorTaskSpec) RunTask(ctx context.Context, _ *zap.Logger clusterMachineIdentity.Metadata().Labels().Set(omni.LabelWorkerRole, "") } + if spec.machineSetName != "" { + clusterMachineIdentity.Metadata().Labels().Set(omni.LabelMachineSet, spec.machineSetName) + } + client, err := 
spec.getClient(ctx) if err != nil { return err diff --git a/internal/backend/runtime/omni/controllers/omni/kubeconfig.go b/internal/backend/runtime/omni/controllers/omni/kubeconfig.go index 650cf8802..e0a5a97fa 100644 --- a/internal/backend/runtime/omni/controllers/omni/kubeconfig.go +++ b/internal/backend/runtime/omni/controllers/omni/kubeconfig.go @@ -24,6 +24,7 @@ import ( "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" "github.com/siderolabs/omni/client/pkg/omni/resources/system" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" "github.com/siderolabs/omni/internal/pkg/certs" ) @@ -70,7 +71,7 @@ func NewKubeconfigController(certificateValidity time.Duration) *KubeconfigContr } // should always call UpdateInputsVersions to update the annotations, due to short-circuiting - if !UpdateInputsVersions[resource.Resource](kubeconfig, secrets, lbConfig) && !staleCertificate { + if !helpers.UpdateInputsVersions[resource.Resource](kubeconfig, secrets, lbConfig) && !staleCertificate { return nil } diff --git a/internal/backend/runtime/omni/controllers/omni/kubernetes_upgrade_manifest_status.go b/internal/backend/runtime/omni/controllers/omni/kubernetes_upgrade_manifest_status.go index 8139c5e35..3e03b10c6 100644 --- a/internal/backend/runtime/omni/controllers/omni/kubernetes_upgrade_manifest_status.go +++ b/internal/backend/runtime/omni/controllers/omni/kubernetes_upgrade_manifest_status.go @@ -29,6 +29,7 @@ import ( "github.com/siderolabs/omni/client/pkg/omni/resources/omni" "github.com/siderolabs/omni/internal/backend/runtime" "github.com/siderolabs/omni/internal/backend/runtime/kubernetes" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/mappers" "github.com/siderolabs/omni/internal/backend/runtime/talos" ) @@ -128,7 +129,7 @@ func 
NewKubernetesUpgradeManifestStatusController() *KubernetesUpgradeManifestSt // - kubernetes upgrade status (Kubernetes version) // - talos upgrade status (Talos version) // - controlplane machine set aggregated config hash (controlplane ConfigPatches) - if !UpdateInputsAnnotation( + if !helpers.UpdateInputsAnnotation( manifestStatus, k8sUpgradeStatus.Metadata().Version().String(), talosUpgradeStatus.Metadata().Version().String(), diff --git a/internal/backend/runtime/omni/controllers/omni/machine_set_destroy_status.go b/internal/backend/runtime/omni/controllers/omni/machine_set_destroy_status.go index baa1c4187..e20a49aa7 100644 --- a/internal/backend/runtime/omni/controllers/omni/machine_set_destroy_status.go +++ b/internal/backend/runtime/omni/controllers/omni/machine_set_destroy_status.go @@ -95,7 +95,7 @@ func (ctrl *MachineSetDestroyStatusController) Run(ctx context.Context, r contro continue } - return fmt.Errorf("failed to destroy cluster: %w", err) + return fmt.Errorf("failed to destroy machine set: %w", err) } } } diff --git a/internal/backend/runtime/omni/controllers/omni/machine_set_node.go b/internal/backend/runtime/omni/controllers/omni/machine_set_node.go index 07f4785df..984e29976 100644 --- a/internal/backend/runtime/omni/controllers/omni/machine_set_node.go +++ b/internal/backend/runtime/omni/controllers/omni/machine_set_node.go @@ -15,6 +15,7 @@ import ( "github.com/cosi-project/runtime/pkg/resource" "github.com/cosi-project/runtime/pkg/safe" "github.com/cosi-project/runtime/pkg/state" + "github.com/siderolabs/gen/xslices" "go.uber.org/zap" "github.com/siderolabs/omni/client/api/omni/specs" @@ -56,6 +57,11 @@ func (ctrl *MachineSetNodeController) Inputs() []controller.Input { Type: omni.MachineClassType, Kind: controller.InputWeak, }, + { + Namespace: resources.DefaultNamespace, + Type: omni.MachineSetNodeType, + Kind: controller.InputDestroyReady, + }, } } @@ -250,7 +256,7 @@ func (ctrl *MachineSetNodeController) deleteNodes( r 
controller.Runtime, machineSetNodes safe.List[*omni.MachineSetNode], machineStatuses map[string]*omni.MachineStatus, - count int, + machinesToDestroyCount int, ) error { usedMachineSetNodes, err := safe.Map(machineSetNodes, func(m *omni.MachineSetNode) (*omni.MachineSetNode, error) { return m, nil @@ -259,14 +265,49 @@ func (ctrl *MachineSetNodeController) deleteNodes( return err } + // filter only running used machines + xslices.FilterInPlace(usedMachineSetNodes, func(r *omni.MachineSetNode) bool { + return r.Metadata().Phase() == resource.PhaseRunning + }) + slices.SortStableFunc(usedMachineSetNodes, getSortFunction(machineStatuses)) - for i := 0; i < count; i++ { - if i >= len(usedMachineSetNodes) { + // destroy all machines which are currently in tearing down phase and have no finalizers + if err = machineSetNodes.ForEachErr(func(machineSetNode *omni.MachineSetNode) error { + if machineSetNode.Metadata().Phase() == resource.PhaseRunning { + return nil + } + + machinesToDestroyCount-- + if machineSetNode.Metadata().Finalizers().Empty() { + return r.Destroy(ctx, machineSetNode.Metadata()) + } + + return nil + }); err != nil { + return err + } + + iterations := len(usedMachineSetNodes) + if machinesToDestroyCount < iterations { + iterations = machinesToDestroyCount + } + + for i := 0; i < iterations; i++ { + var ( + ready bool + err error + ) + + if ready, err = r.Teardown(ctx, usedMachineSetNodes[i].Metadata()); err != nil { + return err + } + + if !ready { return nil } - if err := r.Destroy(ctx, usedMachineSetNodes[i].Metadata()); err != nil { + if err = r.Destroy(ctx, usedMachineSetNodes[i].Metadata()); err != nil { return err } } diff --git a/internal/backend/runtime/omni/controllers/omni/machine_set_status.go b/internal/backend/runtime/omni/controllers/omni/machine_set_status.go index f16227f02..bcc52b7f9 100644 --- a/internal/backend/runtime/omni/controllers/omni/machine_set_status.go +++ b/internal/backend/runtime/omni/controllers/omni/machine_set_status.go 
@@ -7,1168 +7,249 @@ package omni import ( "context" - "crypto/sha256" - "crypto/tls" - "encoding/hex" - "fmt" - "net" - "net/url" "time" "github.com/cosi-project/runtime/pkg/controller" + "github.com/cosi-project/runtime/pkg/controller/generic/qtransform" "github.com/cosi-project/runtime/pkg/resource" "github.com/cosi-project/runtime/pkg/safe" "github.com/cosi-project/runtime/pkg/state" - "github.com/hashicorp/go-multierror" - serverpb "github.com/siderolabs/discovery-api/api/v1alpha1/server/pb" - "github.com/siderolabs/gen/pair" "github.com/siderolabs/gen/xslices" - "github.com/siderolabs/talos/pkg/machinery/constants" "go.uber.org/zap" - "google.golang.org/grpc" - "google.golang.org/grpc/credentials" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/siderolabs/omni/client/api/omni/specs" "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" - "github.com/siderolabs/omni/internal/backend/runtime" - "github.com/siderolabs/omni/internal/backend/runtime/kubernetes" - "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/configpatch" - "github.com/siderolabs/omni/internal/backend/runtime/omni/pkg/check" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/machineset" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/mappers" ) -type machineWithPatches = pair.Pair[*omni.ClusterMachine, *omni.ClusterMachineConfigPatches] - // MachineSetStatusController manages MachineSetStatus resource lifecycle. // // MachineSetStatusController creates and deletes cluster machines, handles rolling updates. -type MachineSetStatusController struct { - discoveryClient serverpb.ClusterClient -} - -// Name implements controller.Controller interface. 
-func (ctrl *MachineSetStatusController) Name() string { - return "MachineSetStatusController" -} - -// Inputs implements controller.Controller interface. -func (ctrl *MachineSetStatusController) Inputs() []controller.Input { - return []controller.Input{ - { - Namespace: resources.DefaultNamespace, - Type: omni.MachineSetType, - Kind: controller.InputStrong, - }, - { - Namespace: resources.DefaultNamespace, - Type: omni.MachineSetNodeType, - Kind: controller.InputWeak, - }, - { - Namespace: resources.DefaultNamespace, - Type: omni.ClusterMachineStatusType, - Kind: controller.InputWeak, - }, - { - Namespace: resources.DefaultNamespace, - Type: omni.ClusterMachineType, - Kind: controller.InputDestroyReady, - }, - { - Namespace: resources.DefaultNamespace, - Type: omni.ControlPlaneStatusType, - Kind: controller.InputWeak, - }, - { - Namespace: resources.DefaultNamespace, - Type: omni.ConfigPatchType, - Kind: controller.InputWeak, - }, - { - Namespace: resources.DefaultNamespace, - Type: omni.ClusterMachineConfigStatusType, - Kind: controller.InputWeak, - }, - { - Namespace: resources.DefaultNamespace, - Type: omni.ClusterSecretsType, - Kind: controller.InputWeak, - }, - { - Namespace: resources.DefaultNamespace, - Type: omni.ClusterMachineIdentityType, - Kind: controller.InputWeak, - }, - { - Namespace: resources.DefaultNamespace, - Type: omni.LoadBalancerStatusType, - Kind: controller.InputWeak, - }, - { - Namespace: resources.DefaultNamespace, - Type: omni.ClusterType, - Kind: controller.InputWeak, - }, - { - Namespace: resources.DefaultNamespace, - Type: omni.MachineType, - Kind: controller.InputStrong, - }, - { - Namespace: resources.DefaultNamespace, - Type: omni.TalosConfigType, - Kind: controller.InputWeak, - }, - } -} - -// Outputs implements controller.Controller interface. 
-func (ctrl *MachineSetStatusController) Outputs() []controller.Output { - return []controller.Output{ - { - Type: omni.MachineSetStatusType, - Kind: controller.OutputExclusive, - }, - { - Type: omni.ClusterMachineType, - Kind: controller.OutputShared, - }, - { - Type: omni.ClusterMachineConfigPatchesType, - Kind: controller.OutputExclusive, - }, - } -} - -// Run implements controller.Controller interface. -// -//nolint:gocognit,gocyclo,cyclop -func (ctrl *MachineSetStatusController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error { - conn, err := ctrl.createDiscoveryClient(ctx) - if err != nil { - return fmt.Errorf("error creating discovery client: %w", err) - } - - defer func() { - if err = conn.Close(); err != nil { - logger.Error("error closing discovery client connection", zap.Error(err)) - } - }() - - for { - select { - case <-ctx.Done(): - return nil - case <-r.EventCh(): - } - - list, err := safe.ReaderListAll[*omni.MachineSet](ctx, r) - if err != nil { - return fmt.Errorf("error listing machine sets: %w", err) - } - - allClusterMachines, err := safe.ReaderListAll[*omni.ClusterMachine](ctx, r) - if err != nil { - return fmt.Errorf("error listing all cluster machines: %w", err) - } - - configPatchHelper, err := configpatch.NewHelper(ctx, r) - if err != nil { - return fmt.Errorf("error creating config patch helper: %w", err) - } - - tracker := trackResource(r, resources.DefaultNamespace, omni.MachineSetStatusType) - - var multiErr error - - for iter := list.Iterator(); iter.Next(); { - // process a single machine set capturing the error - if err = func(iter safe.ListIterator[*omni.MachineSet], logger *zap.Logger) error { - machineSet := iter.Value() - machineSetStatus := omni.NewMachineSetStatus(resources.DefaultNamespace, machineSet.Metadata().ID()) - - tracker.keep(machineSet) - - if machineSet.Metadata().Phase() != resource.PhaseTearingDown { - if err = r.AddFinalizer(ctx, machineSet.Metadata(), ctrl.Name()); err != nil { - return 
fmt.Errorf("error adding finalizer to machine set %q: %w", machineSet.Metadata().ID(), err) - } - } - - clusterName, ok := machineSet.Metadata().Labels().Get(omni.LabelCluster) - if ok { - logger = logger.With(zap.String("cluster", clusterName)) - } - - logger = logger.With(zap.String("machineset", machineSet.Metadata().ID())) - - setErr := safe.WriterModify(ctx, r, machineSetStatus, func(status *omni.MachineSetStatus) error { - CopyLabels(machineSet, machineSetStatus, omni.LabelCluster, omni.LabelControlPlaneRole, omni.LabelWorkerRole) - - if machineSet.Metadata().Phase() == resource.PhaseTearingDown { - status.TypedSpec().Value.Phase = specs.MachineSetPhase_Destroying - status.TypedSpec().Value.Ready = false - status.TypedSpec().Value.Machines = &specs.Machines{ - Total: 0, - Healthy: 0, - } - - machineErr := ctrl.destroyMachinesNoWait(ctx, r, logger, machineSet, allClusterMachines) - if machineErr != nil { - return fmt.Errorf( - "error destroying machines (no wait), phase %q, ready %t: %w", - status.TypedSpec().Value.GetPhase(), - status.TypedSpec().Value.GetReady(), - err, - ) - } - - return nil - } - - var machines safe.List[*omni.MachineSetNode] - - machines, err = safe.ReaderListAll[*omni.MachineSetNode]( - ctx, - r, - state.WithLabelQuery(resource.LabelEqual(omni.LabelMachineSet, machineSet.Metadata().ID())), - ) - if err != nil { - return fmt.Errorf( - "error listing machine set nodes, phase %q, ready %t: %w", - status.TypedSpec().Value.GetPhase(), - status.TypedSpec().Value.GetReady(), - err, - ) - } - - if err = ctrl.reconcileMachines(ctx, r, logger, machines, machineSet, status, allClusterMachines, configPatchHelper); err != nil { - return fmt.Errorf( - "error reconciling machines, phase %q, ready %t: %w", - status.TypedSpec().Value.GetPhase(), - status.TypedSpec().Value.GetReady(), - err, - ) - } - - return nil - }) - if setErr != nil { - return fmt.Errorf("error modifying machine set %q: %w", machineSet.Metadata().ID(), err) - } - - return nil - 
}(iter, logger); err != nil { - multiErr = multierror.Append(multiErr, fmt.Errorf("reconcile of machine set %q failed: %w", iter.Value().Metadata().ID(), err)) - } - } - - if multiErr != nil { - return multiErr - } - - if err = tracker.cleanup(ctx); err != nil { - return err - } - - r.ResetRestartBackoff() - } -} - -//nolint:gocognit,gocyclo,cyclop -func (ctrl *MachineSetStatusController) updateStatus( - ctx context.Context, - r controller.Runtime, - machineSetNodes safe.List[*omni.MachineSetNode], - clusterMachines map[resource.ID]*omni.ClusterMachine, - machineSet *omni.MachineSet, - machineSetStatus *omni.MachineSetStatus, -) error { - spec := machineSetStatus.TypedSpec().Value - - list, err := safe.ReaderListAll[*omni.ClusterMachineStatus]( - ctx, - r, - state.WithLabelQuery(resource.LabelEqual(omni.LabelMachineSet, machineSet.Metadata().ID())), - ) - if err != nil { - return fmt.Errorf("error listing cluster machines: %w", err) - } - - spec.Phase = specs.MachineSetPhase_Running - spec.Error = "" - spec.Machines = &specs.Machines{} - - spec.Machines.Requested = uint32(machineSetNodes.Len()) - - // requested machines is max(manuallyAllocatedMachines, machineClassMachineCount) - // if machine class allocation type is not static it falls back to the actual machineSetNodes count - // then we first compare number of machine set nodes against the number of requested machines - // if they match we compare the number of cluster machines against the number of machine set nodes - machineClass := machineSet.TypedSpec().Value.MachineClass - if machineClass != nil && machineClass.AllocationType == specs.MachineSetSpec_MachineClass_Static { - spec.Machines.Requested = machineClass.MachineCount - } - - spec.MachineClass = machineClass - - switch { - case machineSetNodes.Len() < int(spec.Machines.Requested): - spec.Phase = specs.MachineSetPhase_ScalingUp - case machineSetNodes.Len() > int(spec.Machines.Requested): - spec.Phase = specs.MachineSetPhase_ScalingDown - case 
list.Len() < machineSetNodes.Len(): - spec.Phase = specs.MachineSetPhase_ScalingUp - case len(clusterMachines) > machineSetNodes.Len(): - spec.Phase = specs.MachineSetPhase_ScalingDown - } - - if isControlPlane(machineSet) && machineSetNodes.Len() == 0 { - spec.Phase = specs.MachineSetPhase_Failed - spec.Error = "control plane machine set must have at least one node" - } - - clusterMachineStatuses := map[resource.ID]*omni.ClusterMachineStatus{} - - for iter := list.Iterator(); iter.Next(); { - clusterMachineStatus := iter.Value() - - clusterMachineStatuses[clusterMachineStatus.Metadata().ID()] = clusterMachineStatus - - if spec.Phase == specs.MachineSetPhase_Running { - if clusterMachineStatus.TypedSpec().Value.GetConfigApplyStatus() == specs.ConfigApplyStatus_PENDING { - spec.Phase = specs.MachineSetPhase_Reconfiguring - } - } - } - - for _, clusterMachine := range clusterMachines { - spec.Machines.Total++ - - if clusterMachineStatus := clusterMachineStatuses[clusterMachine.Metadata().ID()]; clusterMachineStatus != nil { - if clusterMachineStatus.TypedSpec().Value.Stage == specs.ClusterMachineStatusSpec_RUNNING && clusterMachineStatus.TypedSpec().Value.Ready { - spec.Machines.Healthy++ - } - - if _, ok := clusterMachineStatus.Metadata().Labels().Get(omni.MachineStatusLabelConnected); ok { - spec.Machines.Connected++ - } - } - } - - spec.Ready = spec.Phase == specs.MachineSetPhase_Running - - if !spec.Ready { - return nil - } - - configHashHasher := sha256.New() - - for iter := machineSetNodes.Iterator(); iter.Next(); { - machineSetNode := iter.Value() - - clusterMachine := clusterMachines[machineSetNode.Metadata().ID()] - clusterMachineStatus := clusterMachineStatuses[machineSetNode.Metadata().ID()] - - if clusterMachine == nil || clusterMachineStatus == nil { - spec.Ready = false - spec.Phase = specs.MachineSetPhase_ScalingUp - - return nil - } - - clusterMachineStatusSpec := clusterMachineStatus.TypedSpec().Value - - if clusterMachineStatusSpec.Stage != 
specs.ClusterMachineStatusSpec_RUNNING { - spec.Ready = false - - return nil - } - - if !clusterMachineStatusSpec.Ready { - spec.Ready = false - - return nil - } - - configStatus, err := safe.ReaderGet[*omni.ClusterMachineConfigStatus]( - ctx, - r, - omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, machineSetNode.Metadata().ID()).Metadata(), - ) - if err != nil && !state.IsNotFoundError(err) { - return fmt.Errorf( - "error getting cluster machine config status for node %q: %w", - machineSetNode.Metadata().ID(), - err, - ) - } - - if configStatus != nil { - configHashHasher.Write([]byte(configStatus.TypedSpec().Value.ClusterMachineConfigSha256)) - } - - if configStatus == nil || isOutdated(clusterMachine, configStatus) { - spec.Ready = false - - return nil - } - } - - // combined hash of all cluster machine config hashes - spec.ConfigHash = hex.EncodeToString(configHashHasher.Sum(nil)) - - return nil -} - -func (ctrl *MachineSetStatusController) reconcileMachines( - ctx context.Context, - r controller.Runtime, - logger *zap.Logger, - expectedNodes safe.List[*omni.MachineSetNode], - machineSet *omni.MachineSet, - machineSetStatus *omni.MachineSetStatus, - allClusterMachines safe.List[*omni.ClusterMachine], - configPatchHelper *configpatch.Helper, -) error { - clusterMachines := allClusterMachines.FilterLabelQuery(resource.LabelEqual(omni.LabelMachineSet, machineSet.Metadata().ID())) - - expectedNodesMap := map[resource.ID]*omni.MachineSetNode{} - - expectedNodes.ForEach(func(node *omni.MachineSetNode) { expectedNodesMap[node.Metadata().ID()] = node }) - - clusterName, ok := machineSet.Metadata().Labels().Get(omni.LabelCluster) - if !ok { - return fmt.Errorf("failed to determine machine set %q cluster", machineSet.Metadata().ID()) - } - - cluster, err := safe.ReaderGet[*omni.Cluster]( - ctx, - r, - omni.NewCluster(resources.DefaultNamespace, clusterName).Metadata(), - ) - if err != nil { - if state.IsNotFoundError(err) { - logger.Info("cluster doesn't 
exist: skip machines reconcile call") - - return nil - } - - return fmt.Errorf("failed to get cluster spec %q: %w", clusterName, err) - } - - var tearingDownMachines []*omni.ClusterMachine - - currentClusterMachines := map[resource.ID]*omni.ClusterMachine{} - - clusterMachines.ForEach(func(clusterMachine *omni.ClusterMachine) { - currentClusterMachines[clusterMachine.Metadata().ID()] = clusterMachine - - if clusterMachine.Metadata().Phase() == resource.PhaseTearingDown { - tearingDownMachines = append(tearingDownMachines, clusterMachine) - } - }) - - if err = ctrl.updateStatus(ctx, r, expectedNodes, currentClusterMachines, machineSet, machineSetStatus); err != nil { - return fmt.Errorf("failed to update machine set status: %w", err) - } - - createMachines, err := ctrl.getMachinesToCreate(cluster, machineSet, currentClusterMachines, expectedNodes, configPatchHelper) - if err != nil { - return fmt.Errorf("failed to get machines to create: %w", err) - } - - if len(createMachines) != 0 { - err = ctrl.updateMachines(ctx, r, logger, createMachines, true) - if err != nil { - return fmt.Errorf("failed to create machines: %w", err) - } +type MachineSetStatusController = qtransform.QController[*omni.MachineSet, *omni.MachineSetStatus] - return nil - } +const requeueInterval = time.Second * 30 - loadbalancerInfo, err := safe.ReaderGet[*omni.LoadBalancerStatus]( - ctx, - r, - omni.NewLoadBalancerStatus(resources.DefaultNamespace, clusterName).Metadata(), - ) - if err != nil { - if state.IsNotFoundError(err) { - logger.Info("load balancer status is unknown: skip machines update/destroy") - - return nil +// NewMachineSetStatusController creates new MachineSetStatusController. 
+func NewMachineSetStatusController() *MachineSetStatusController { + mapMachineIDToMachineSet := func(ctx context.Context, r controller.QRuntime, res resource.Resource, label string) ([]resource.Pointer, error) { + id, ok := res.Metadata().Labels().Get(label) + if !ok { + return nil, nil } - return fmt.Errorf("failed to get load balancer status: %w", err) - } - - // skip the rest if found any machine which is being torn down - if len(tearingDownMachines) > 0 { - err = ctrl.destroyMachines(ctx, r, logger, machineSet, tearingDownMachines) + input, err := safe.ReaderGetByID[*omni.ClusterMachine](ctx, r, id) if err != nil { - return fmt.Errorf("failed to destroy machines: %w", err) - } - - return nil - } - - updateMachines, err := ctrl.getMachinesToUpdate(ctx, r, cluster, machineSet, currentClusterMachines, expectedNodes, configPatchHelper) - if err != nil { - return fmt.Errorf("failed to get machines to update: %w", err) - } - - if len(updateMachines) > 0 { - err = ctrl.updateMachines(ctx, r, logger, updateMachines, false) - if err != nil { - return fmt.Errorf("failed to update machines: %w", err) - } - - return nil - } - - // skip the rest if Kubernetes is not up - if !loadbalancerInfo.TypedSpec().Value.Healthy { - logger.Info("loadbalancer is not healthy: skip destroy flow") - - return nil - } - - destroyMachines, err := ctrl.getMachinesToDestroy(ctx, r, machineSet, clusterMachines, expectedNodesMap) - if err != nil { - return fmt.Errorf("failed to get machines to destroy: %w", err) - } - - if len(destroyMachines) == 0 { - return nil - } - - err = ctrl.destroyMachines(ctx, r, logger, machineSet, destroyMachines) - if err != nil { - return fmt.Errorf("failed to destroy machines: %w", err) - } - - return nil -} - -func (ctrl *MachineSetStatusController) getMachinesToCreate( - cluster *omni.Cluster, - machineSet *omni.MachineSet, - currentClusterMachines map[resource.ID]*omni.ClusterMachine, - expectedMachines safe.List[*omni.MachineSetNode], - configPatchHelper 
*configpatch.Helper, -) ([]machineWithPatches, error) { - var clusterMachines []machineWithPatches - - for iter := expectedMachines.Iterator(); iter.Next(); { - machineSetNode := iter.Value() - - if _, ok := currentClusterMachines[machineSetNode.Metadata().ID()]; !ok { - clusterMachine := omni.NewClusterMachine(resources.DefaultNamespace, machineSetNode.Metadata().ID()) - clusterMachineConfigPatches := omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, machineSetNode.Metadata().ID()) - - item := pair.MakePair(clusterMachine, clusterMachineConfigPatches) - - CopyLabels(machineSet, clusterMachineConfigPatches, omni.LabelCluster, omni.LabelWorkerRole, omni.LabelControlPlaneRole) - CopyLabels(machineSet, clusterMachine, omni.LabelCluster, omni.LabelWorkerRole, omni.LabelControlPlaneRole) - - clusterMachine.Metadata().Labels().Set(omni.LabelMachineSet, machineSet.Metadata().ID()) - clusterMachineConfigPatches.Metadata().Labels().Set(omni.LabelMachineSet, machineSet.Metadata().ID()) - - _, err := ctrl.updateClusterMachine(cluster, item, machineSet, configPatchHelper) - if err != nil { - return nil, fmt.Errorf("failed to update cluster machine: %w", err) - } - - clusterMachines = append(clusterMachines, item) - } - } - - return clusterMachines, nil -} - -//nolint:gocognit -func (ctrl *MachineSetStatusController) getMachinesToUpdate( - ctx context.Context, - r controller.Runtime, - cluster *omni.Cluster, - machineSet *omni.MachineSet, - currentClusterMachines map[resource.ID]*omni.ClusterMachine, - expectedMachines safe.List[*omni.MachineSetNode], - configPatchHelper *configpatch.Helper, -) ([]machineWithPatches, error) { - var outdatedMachines int - - configStatuses := map[resource.ID]*omni.ClusterMachineConfigStatus{} - - if err := expectedMachines.ForEachErr(func(machineSetNode *omni.MachineSetNode) error { - configStatus, err := safe.ReaderGet[*omni.ClusterMachineConfigStatus]( - ctx, - r, - omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, 
machineSetNode.Metadata().ID()).Metadata(), - ) - if err != nil { - outdatedMachines++ - if state.IsNotFoundError(err) { - return nil + return nil, nil } - - return fmt.Errorf( - "failed to get cluster machine node %q config status: %w", - machineSetNode.Metadata().ID(), - err, - ) - } - - configStatuses[configStatus.Metadata().ID()] = configStatus - - clusterMachine := currentClusterMachines[configStatus.Metadata().ID()] - - if isOutdated(clusterMachine, configStatus) { - outdatedMachines++ } - return nil - }); err != nil { - return nil, err - } - - maxParallelism := 0 - if machineSet.TypedSpec().Value.GetUpdateStrategy() == specs.MachineSetSpec_Rolling { - maxParallelism = max(1, int(machineSet.TypedSpec().Value.GetUpdateStrategyConfig().GetRolling().GetMaxParallelism())) - } - - if isControlPlane(machineSet) { - maxParallelism = 1 - } - - var items []machineWithPatches - - for iter := expectedMachines.Iterator(); iter.Next(); { - machineSetNode := iter.Value() - - // machine is locked, skip the machine - if _, locked := machineSetNode.Metadata().Annotations().Get(omni.MachineLocked); locked { - continue - } - - clusterMachine, ok := currentClusterMachines[machineSetNode.Metadata().ID()] + id, ok = input.Metadata().Labels().Get(omni.LabelMachineSet) if !ok { - continue - } - - configStatus, ok := configStatuses[clusterMachine.Metadata().ID()] - - // is the latest machine config applied to the machine? - outdated := !ok || isOutdated(clusterMachine, configStatus) - - item := pair.MakePair(clusterMachine, omni.NewClusterMachineConfigPatches(clusterMachine.Metadata().Namespace(), clusterMachine.Metadata().ID())) - - // are there any outstanding config patches to be applied? 
- updated, err := ctrl.updateClusterMachine(cluster, item, machineSet, configPatchHelper) - if err != nil { - return nil, fmt.Errorf("failed to update cluster machine: %w", err) - } - - // machine config patches are up-to-date, skip the machine - if !updated { - continue - } - - // the current machine needs to be updated - switch machineSet.TypedSpec().Value.UpdateStrategy { - case specs.MachineSetSpec_Rolling: - // return if we have reached the max parallelism for rolling strategy - if len(items) >= maxParallelism { - return items, nil - } - - // normal mode - there are no outdated machines, so simply add the current machine to the list, as it has a pending update - if outdatedMachines == 0 { - items = append(items, item) - - continue - } - - // prioritization mode - there are outdated machines: - // add the current machine (which has a pending update) to the list only if it is one of the outdated ones - // this handles the case of a "broken" config patch which never applies - if outdated { - items = append(items, item) - } - case specs.MachineSetSpec_Unset: - items = append(items, item) - } - } - - return items, nil -} - -func (ctrl *MachineSetStatusController) getMachinesToDestroy( - ctx context.Context, - r controller.Runtime, - machineSet *omni.MachineSet, - currentClusterMachines safe.List[*omni.ClusterMachine], - expectedMachines map[resource.ID]*omni.MachineSetNode, -) ([]*omni.ClusterMachine, error) { - var clusterMachines []*omni.ClusterMachine - - machineSetSpec := machineSet.TypedSpec().Value - maxParallelism := 0 - rolling := false - - // if the machine set has a rolling update strategy, use it - if machineSetSpec.GetUpdateStrategy() == specs.MachineSetSpec_Rolling { - rolling = true - maxParallelism = max(1, int(machineSetSpec.GetDeleteStrategyConfig().GetRolling().GetMaxParallelism())) // if the max parallelism is not set (is zero), use 1 - } - - // if this is a control plane machine set, override the strategy to always use rolling strategy with max 
parallelism of 1 - if isControlPlane(machineSet) { - rolling = true - maxParallelism = 1 - } - - for iter := currentClusterMachines.Iterator(); iter.Next(); { - clusterMachine := iter.Value() - - if rolling && len(clusterMachines) >= maxParallelism { - break - } - - if _, ok := expectedMachines[clusterMachine.Metadata().ID()]; !ok { - clusterMachines = append(clusterMachines, clusterMachine) - } - } - - if len(clusterMachines) == 0 { - return clusterMachines, nil - } - - if isControlPlane(machineSet) { - // block removing all machines for control plane machine set - if len(clusterMachines) == currentClusterMachines.Len() { return nil, nil } - status, err := check.EtcdStatus(ctx, r, machineSet) - if err != nil { - return nil, err - } - - if err = check.CanScaleDown(status, clusterMachines[0]); err != nil { - return nil, err - } - } - - return clusterMachines, nil -} - -func (ctrl *MachineSetStatusController) updateMachines( - ctx context.Context, - r controller.Runtime, - logger *zap.Logger, - machinesWithPatches []machineWithPatches, - created bool, -) error { - action := "update" - if created { - action = "create" + return []resource.Pointer{ + omni.NewMachineSet(resources.DefaultNamespace, id).Metadata(), + }, nil } - logger.Info(fmt.Sprintf("%s machines", action), zap.Strings("machines", xslices.Map(machinesWithPatches, func(m machineWithPatches) string { return m.F1.Metadata().ID() }))) - - for _, pair := range machinesWithPatches { - clusterMachine := pair.F1 + handler := &machineSetStatusHandler{} - clusterName, ok := clusterMachine.Metadata().Labels().Get(omni.LabelCluster) - if !ok { - return fmt.Errorf("cluster machine update doesn't have %s label", omni.LabelCluster) - } - - if err := safe.WriterModify(ctx, r, omni.NewClusterMachineConfigPatches(resources.DefaultNamespace, clusterMachine.Metadata().ID()), - func(res *omni.ClusterMachineConfigPatches) error { - // update the value - res.TypedSpec().Value.Patches = pair.F2.TypedSpec().Value.Patches - - 
return nil + return qtransform.NewQController( + qtransform.Settings[*omni.MachineSet, *omni.MachineSetStatus]{ + Name: machineset.ControllerName, + MapMetadataFunc: func(machineSet *omni.MachineSet) *omni.MachineSetStatus { + return omni.NewMachineSetStatus(resources.DefaultNamespace, machineSet.Metadata().ID()) }, - ); err != nil { - return err - } - - if err := safe.WriterModify(ctx, r, clusterMachine, func(res *omni.ClusterMachine) error { - // don't update the ClusterMachine if it's still owned by another cluster - currentClusterName, ok := res.Metadata().Labels().Get(omni.LabelCluster) - if ok && currentClusterName != clusterName { - return nil - } + UnmapMetadataFunc: func(machineSetStatus *omni.MachineSetStatus) *omni.MachineSet { + return omni.NewMachineSet(resources.DefaultNamespace, machineSetStatus.Metadata().ID()) + }, + TransformExtraOutputFunc: handler.reconcileRunning, + FinalizerRemovalExtraOutputFunc: handler.reconcileTearingDown, + }, + qtransform.WithConcurrency(8), + qtransform.WithExtraMappedInput( + qtransform.MapperSameID[*omni.ControlPlaneStatus, *omni.MachineSet](), + ), + qtransform.WithExtraMappedInput( + mappers.MapByMachineSetLabel[*omni.MachineSetNode, *omni.MachineSet](), + ), + qtransform.WithExtraMappedInput( + mappers.MapByMachineSetLabel[*omni.ClusterMachineStatus, *omni.MachineSet](), + ), + qtransform.WithExtraMappedDestroyReadyInput( + mappers.MapByMachineSetLabel[*omni.ClusterMachine, *omni.MachineSet](), + ), + qtransform.WithExtraMappedInput( + mappers.MapByMachineSetLabel[*omni.ClusterMachineConfigStatus, *omni.MachineSet](), + ), + qtransform.WithExtraMappedInput( + mappers.MapClusterResourceToLabeledResources[*omni.ClusterSecrets, *omni.MachineSet](), + ), + qtransform.WithExtraMappedInput( + mappers.MapClusterResourceToLabeledResources[*omni.Cluster, *omni.MachineSet](), + ), + qtransform.WithExtraMappedInput( + mappers.MapClusterResourceToLabeledResources[*omni.TalosConfig, *omni.MachineSet](), + ), + 
qtransform.WithExtraMappedInput( + mappers.MapByMachineSetLabel[*omni.ClusterMachineIdentity, *omni.MachineSet](), + ), + qtransform.WithExtraMappedInput( + mappers.MapClusterResourceToLabeledResources[*omni.LoadBalancerStatus, *omni.MachineSet](), + ), + qtransform.WithExtraMappedInput( + // machine to machine set, if the machine is allocated + func(ctx context.Context, _ *zap.Logger, r controller.QRuntime, machine *omni.Machine) ([]resource.Pointer, error) { + clusterMachine, err := r.Get(ctx, omni.NewClusterMachine(resources.DefaultNamespace, machine.Metadata().ID()).Metadata()) + if err != nil { + if state.IsNotFoundError(err) { + return nil, nil + } + } - // update the labels - CopyAllLabels(clusterMachine, res) + machineSetID, ok := clusterMachine.Metadata().Labels().Get(omni.LabelMachineSet) + if !ok { + return nil, nil + } - // update the annotations to make sure that inputResourceVersion gets updated - CopyAllAnnotations(clusterMachine, res) + return []resource.Pointer{ + omni.NewMachineSet(resources.DefaultNamespace, machineSetID).Metadata(), + }, nil + }, + ), + qtransform.WithExtraMappedInput( + // config patch to machine set if the machine is allocated, checks by different layers, if is on the cluster layer, + // matches all machine sets + func(ctx context.Context, _ *zap.Logger, r controller.QRuntime, patch *omni.ConfigPatch) ([]resource.Pointer, error) { + clusterName, ok := patch.Metadata().Labels().Get(omni.LabelCluster) + if !ok { + // no cluster, map by the machine ID + return mapMachineIDToMachineSet(ctx, r, patch, omni.LabelMachine) + } - if res.TypedSpec().Value.KubernetesVersion == "" { - res.TypedSpec().Value.KubernetesVersion = clusterMachine.TypedSpec().Value.KubernetesVersion - } + // cluster machine patch + pointers, err := mapMachineIDToMachineSet(ctx, r, patch, omni.LabelClusterMachine) + if err != nil { + return nil, err + } - return nil - }); err != nil { - return fmt.Errorf("error updating cluster machine %q: %w", 
clusterMachine.Metadata().ID(), err) - } + if pointers != nil { + return pointers, err + } - // hold the Machine via the finalizer - if err := r.AddFinalizer( - ctx, - omni.NewMachine(resources.DefaultNamespace, clusterMachine.Metadata().ID()).Metadata(), - ctrl.Name(), - ); err != nil { - return fmt.Errorf( - "error adding finalizer to machine %q in cluster %q: %w", - clusterMachine.Metadata().ID(), - clusterName, - err, - ) - } + // machine set level patch + machineSetID, ok := patch.Metadata().Labels().Get(omni.LabelMachineSet) + if ok { + return []resource.Pointer{ + omni.NewMachineSet(resources.DefaultNamespace, machineSetID).Metadata(), + }, nil + } - if created { - logger.Info("added the machine to the machine set", - zap.String("machine", clusterMachine.Metadata().ID()), - ) - } - } + // cluster level patch, find all machine sets in a cluster + list, err := r.List(ctx, omni.NewMachineSet(resources.DefaultNamespace, "").Metadata(), state.WithLabelQuery( + resource.LabelEqual(omni.LabelCluster, clusterName), + )) + if err != nil { + return nil, err + } - return nil + return xslices.Map(list.Items, func(r resource.Resource) resource.Pointer { return r.Metadata() }), nil + }, + ), + qtransform.WithExtraOutputs( + controller.Output{ + Type: omni.ClusterMachineType, + Kind: controller.OutputExclusive, + }, + controller.Output{ + Type: omni.ClusterMachineConfigPatchesType, + Kind: controller.OutputExclusive, + }, + ), + ) } -func (ctrl *MachineSetStatusController) destroyMachines(ctx context.Context, r controller.Runtime, logger *zap.Logger, machineSet *omni.MachineSet, clusterMachines []*omni.ClusterMachine) error { - var err error - - logger.Info("destroy machines", zap.Strings("machines", xslices.Map(clusterMachines, func(m *omni.ClusterMachine) string { return m.Metadata().ID() }))) +type machineSetStatusHandler struct{} +func (handler *machineSetStatusHandler) reconcileRunning(ctx context.Context, r controller.ReaderWriter, logger *zap.Logger, + machineSet 
*omni.MachineSet, machineSetStatus *omni.MachineSetStatus, +) error { clusterName, ok := machineSet.Metadata().Labels().Get(omni.LabelCluster) - if !ok { - return fmt.Errorf("failed to determine cluster name of the machine set %s", machineSet.Metadata().ID()) + if ok { + logger = logger.With(zap.String("cluster", clusterName)) } - secrets, err := safe.ReaderGet[*omni.ClusterSecrets](ctx, r, omni.NewClusterSecrets( - resources.DefaultNamespace, - clusterName, - ).Metadata()) - if err != nil { - if state.IsNotFoundError(err) { - return nil - } - - return fmt.Errorf("failed to get cluster %q secrets: %w", clusterName, err) - } + logger = logger.With(zap.String("machineset", machineSet.Metadata().ID())) - bundle, err := omni.ToSecretsBundle(secrets) - if err != nil { - return fmt.Errorf("failed to convert cluster %q secrets to bundle: %w", clusterName, err) - } - - nodeNameOccurences, clusterMachineIdentities, err := ctrl.getClusterMachineIdentities(ctx, r, clusterName) + rc, err := machineset.BuildReconciliationContext(ctx, r, machineSet) if err != nil { return err } - for _, clusterMachine := range clusterMachines { - var ( - ready bool - err error - ) - - clusterMachineIdentity := clusterMachineIdentities[clusterMachine.Metadata().ID()] - - if ready, err = r.Teardown(ctx, clusterMachine.Metadata()); err != nil { - return fmt.Errorf( - "error tearing down machine %q in cluster %q: %w", - clusterMachine.Metadata().ID(), - clusterName, - err, - ) - } - - if !ready { - continue - } - - if _, ok := machineSet.Metadata().Labels().Get(omni.LabelSkipTeardown); !ok && clusterMachineIdentity != nil && nodeNameOccurences[clusterMachineIdentity.TypedSpec().Value.Nodename] == 1 { - if err = ctrl.teardownNode(ctx, clusterMachine, clusterMachineIdentity); err != nil { - return fmt.Errorf("error tearing down node %q: %w", clusterMachineIdentity.TypedSpec().Value.Nodename, err) - } - } + // should run always + machineset.ReconcileStatus(rc, machineSetStatus) - if _, ok := 
machineSet.Metadata().Labels().Get(omni.LabelSkipTeardown); !ok && clusterMachineIdentity != nil { - if err = ctrl.deleteMember(ctx, r, bundle.Cluster.ID, clusterMachine); err != nil { - return fmt.Errorf( - "error deleting member %q: %w", - clusterMachineIdentity.TypedSpec().Value.Nodename, - err, - ) - } - } - - // release the Machine finalizer - if err = r.RemoveFinalizer( - ctx, - omni.NewMachine(resources.DefaultNamespace, clusterMachine.Metadata().ID()).Metadata(), - ctrl.Name(), - ); err != nil { - return fmt.Errorf( - "error removing finalizer from machine %q: %w", - clusterMachine.Metadata().ID(), - err, - ) - } - - if err = ctrl.destroyMachine(ctx, r, clusterMachine); err != nil { - return fmt.Errorf("error destroying machine %q: %w", clusterMachine.Metadata().ID(), err) - } - - logger.Info("removed the machine from the machine set gracefully", - zap.String("machine", clusterMachine.Metadata().ID()), - ) - } - - return nil -} - -func (ctrl *MachineSetStatusController) deleteMember( - ctx context.Context, - r controller.Runtime, - clusterID string, - clusterMachine *omni.ClusterMachine, -) error { - clusterMachineIdentity, err := safe.ReaderGet[*omni.ClusterMachineIdentity]( - ctx, - r, - omni.NewClusterMachineIdentity(resources.DefaultNamespace, clusterMachine.Metadata().ID()).Metadata(), - ) + requeue, err := handler.reconcileMachines(ctx, r, logger, rc) if err != nil { - if state.IsNotFoundError(err) { - return nil - } - - return fmt.Errorf("error getting identity: %w", err) + return err } - ctx, cancel := context.WithTimeout(ctx, time.Second*5) - defer cancel() - - _, err = ctrl.discoveryClient.AffiliateDelete(ctx, &serverpb.AffiliateDeleteRequest{ - ClusterId: clusterID, - AffiliateId: clusterMachineIdentity.TypedSpec().Value.NodeIdentity, - }) - if err != nil { - return fmt.Errorf( - "error deleting member %q: %w", - clusterMachineIdentity.TypedSpec().Value.NodeIdentity, - err, - ) + if requeue { + return 
controller.NewRequeueInterval(requeueInterval) } return nil } -func (ctrl *MachineSetStatusController) updateClusterMachine( - cluster *omni.Cluster, - pair machineWithPatches, - machineSet *omni.MachineSet, - configPatchHelper *configpatch.Helper, -) (bool, error) { - clusterMachine, clusterMachineConfigPatches := pair.F1, pair.F2 - - patches, err := configPatchHelper.Get(clusterMachine, machineSet) +func (handler *machineSetStatusHandler) reconcileTearingDown(ctx context.Context, r controller.ReaderWriter, logger *zap.Logger, machineSet *omni.MachineSet) error { + rc, err := machineset.BuildReconciliationContext(ctx, r, machineSet) if err != nil { - return false, err - } - - if !UpdateInputsVersions(clusterMachine, patches...) && clusterMachine.TypedSpec().Value.KubernetesVersion != "" { - return false, nil - } - - patchesRaw := make([]string, 0, len(patches)) - for _, p := range patches { - patchesRaw = append(patchesRaw, p.TypedSpec().Value.Data) - } - - clusterMachineConfigPatches.TypedSpec().Value.Patches = patchesRaw - clusterMachine.TypedSpec().Value.KubernetesVersion = cluster.TypedSpec().Value.KubernetesVersion // this will only be applied once - - return true, nil -} - -func (ctrl *MachineSetStatusController) destroyMachine(ctx context.Context, r controller.Runtime, clusterMachine *omni.ClusterMachine) error { - configPatches := omni.NewClusterMachineConfigPatches(clusterMachine.Metadata().Namespace(), clusterMachine.Metadata().ID()) - - if err := r.Destroy(ctx, configPatches.Metadata()); err != nil && !state.IsNotFoundError(err) { return err } - if err := r.Destroy(ctx, clusterMachine.Metadata()); err != nil && !state.IsNotFoundError(err) { - return err - } - - return nil -} - -// destroyMachinesNoWait kicks in when the machine is in tearing down phase. -// it removes all machines without waiting for them to be healthy, ignores upgrade strategy. 
-func (ctrl *MachineSetStatusController) destroyMachinesNoWait( - ctx context.Context, - r controller.Runtime, - logger *zap.Logger, - machineSet *omni.MachineSet, - allClusterMachines safe.List[*omni.ClusterMachine], -) error { - list := allClusterMachines.FilterLabelQuery(resource.LabelEqual(omni.LabelMachineSet, machineSet.Metadata().ID())) - - removeFinalizer := true + clusterMachinesCount := uint32(len(rc.GetClusterMachines())) + // no cluster machines release the finalizer + if clusterMachinesCount == 0 { + logger.Info("machineset torn down", zap.String("machineset", machineSet.Metadata().ID())) - for iter := list.Iterator(); iter.Next(); { - ready, err := r.Teardown(ctx, iter.Value().Metadata()) - if err != nil { - return fmt.Errorf("error tearing down machine %q: %w", iter.Value().Metadata().ID(), err) - } - - if iter.Value().Metadata().Phase() == resource.PhaseRunning { - // if it's the first time we attempt a tear down - logger.Info("tearing down the machine from the machine set (no wait)", - zap.String("machine", iter.Value().Metadata().ID()), - ) - } - - if !ready { - removeFinalizer = false - - continue - } - - // release the Machine finalizer - if err = r.RemoveFinalizer( - ctx, - omni.NewMachine(resources.DefaultNamespace, iter.Value().Metadata().ID()).Metadata(), - ctrl.Name(), - ); err != nil { - return fmt.Errorf("error removing finalizer from machine %q: %w", iter.Value().Metadata().ID(), err) - } - - if err = ctrl.destroyMachine(ctx, r, iter.Value()); err != nil { - return fmt.Errorf("error destroying machine %q: %w", iter.Value().Metadata().ID(), err) - } - - logger.Info("removed the machine from the machine set (no wait)", - zap.String("machine", iter.Value().Metadata().ID()), - ) + return nil } - if removeFinalizer { - err := r.RemoveFinalizer(ctx, machineSet.Metadata(), ctrl.Name()) - if err != nil { - return fmt.Errorf("error removing finalizer from machine set %q: %w", machineSet.Metadata().ID(), err) + err = 
safe.WriterModify[*omni.MachineSetStatus](ctx, r, omni.NewMachineSetStatus(resources.DefaultNamespace, machineSet.Metadata().ID()), func(status *omni.MachineSetStatus) error { + status.TypedSpec().Value.Phase = specs.MachineSetPhase_Destroying + status.TypedSpec().Value.Ready = false + status.TypedSpec().Value.Machines = &specs.Machines{ + Total: clusterMachinesCount, + Healthy: 0, } return nil - } - - return nil -} - -func (ctrl *MachineSetStatusController) teardownNode( - ctx context.Context, - clusterMachine *omni.ClusterMachine, - clusterMachineIdentity *omni.ClusterMachineIdentity, -) error { - clusterName, ok := clusterMachine.Metadata().Labels().Get(omni.LabelCluster) - if !ok { - return fmt.Errorf("cluster machine %s doesn't have cluster label set", clusterMachine.Metadata().ID()) - } - - type kubeRuntime interface { - GetClient(ctx context.Context, cluster string) (*kubernetes.Client, error) - } - - k8s, err := runtime.LookupInterface[kubeRuntime](kubernetes.Name) + }) if err != nil { return err } - k8sClient, err := k8s.GetClient(ctx, clusterName) - if err != nil { - return fmt.Errorf("error getting kubernetes client for cluster %q: %w", clusterName, err) - } - - ctx, cancel := context.WithTimeout(ctx, time.Second*5) - defer cancel() - - nodename := clusterMachineIdentity.TypedSpec().Value.Nodename - - err = k8sClient.Clientset().CoreV1().Nodes().Delete(ctx, nodename, metav1.DeleteOptions{}) - if err != nil && !apierrors.IsNotFound(err) { - return fmt.Errorf("error deleting node %q in cluster %q: %w", nodename, clusterName, err) - } - - return nil -} - -func (ctrl *MachineSetStatusController) createDiscoveryClient(ctx context.Context) (*grpc.ClientConn, error) { - u, err := url.Parse(constants.DefaultDiscoveryServiceEndpoint) - if err != nil { - return nil, err + if _, err := handler.reconcileMachines(ctx, r, logger, rc); err != nil { + return err } - discoveryConn, err := grpc.DialContext(ctx, net.JoinHostPort(u.Host, "443"), - 
grpc.WithTransportCredentials( - credentials.NewTLS(&tls.Config{}), - ), - grpc.WithSharedWriteBuffer(true), - ) - if err != nil { - return nil, err + // teardown complete, ignore requeue and unlock the finalizer now + if len(rc.GetRunningClusterMachines()) == 0 { + return nil } - ctrl.discoveryClient = serverpb.NewClusterClient(discoveryConn) - - return discoveryConn, nil -} - -func isOutdated(clusterMachine *omni.ClusterMachine, configStatus *omni.ClusterMachineConfigStatus) bool { - return configStatus.TypedSpec().Value.ClusterMachineVersion != clusterMachine.Metadata().Version().String() || configStatus.TypedSpec().Value.LastConfigError != "" + return controller.NewRequeueErrorf(requeueInterval, "the machine set still has cluster machines") } -func (ctrl *MachineSetStatusController) getClusterMachineIdentities( - ctx context.Context, - r controller.Runtime, - clusterName string, -) (map[string]int, map[string]*omni.ClusterMachineIdentity, error) { - list, err := safe.ReaderListAll[*omni.ClusterMachineIdentity]( - ctx, - r, - state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, clusterName)), - ) - if err != nil { - return nil, nil, fmt.Errorf("error listing cluster %q machine identities: %w", clusterName, err) +func (handler *machineSetStatusHandler) reconcileMachines(ctx context.Context, r controller.ReaderWriter, logger *zap.Logger, rc *machineset.ReconciliationContext) (bool, error) { + if err := machineset.UpdateFinalizers(ctx, r, rc); err != nil { + return false, err } - nodeNameOccurences := map[string]int{} - clusterMachineIdentities := map[string]*omni.ClusterMachineIdentity{} - - list.ForEach(func(res *omni.ClusterMachineIdentity) { - clusterMachineIdentities[res.Metadata().ID()] = res - nodeNameOccurences[res.TypedSpec().Value.Nodename]++ - }) - - return nodeNameOccurences, clusterMachineIdentities, nil -} - -func isControlPlane(res resource.Resource) bool { - _, found := res.Metadata().Labels().Get(omni.LabelControlPlaneRole) - - return found 
+ // return requeue as separate flag and return requeue in the end of the function + return machineset.ReconcileMachines(ctx, r, logger, rc) } diff --git a/internal/backend/runtime/omni/controllers/omni/machine_set_status_test.go b/internal/backend/runtime/omni/controllers/omni/machine_set_status_test.go index 52eb7c96f..d8059fcef 100644 --- a/internal/backend/runtime/omni/controllers/omni/machine_set_status_test.go +++ b/internal/backend/runtime/omni/controllers/omni/machine_set_status_test.go @@ -30,6 +30,7 @@ import ( "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" "github.com/siderolabs/omni/client/pkg/omni/resources/siderolink" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" omnictrl "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni" ) @@ -81,7 +82,6 @@ func (suite *MachineSetStatusSuite) createMachineSetWithOpts(clusterName string, machineSet := omni.NewMachineSet(resources.DefaultNamespace, machineSetName) machineSet.Metadata().Labels().Set(omni.LabelCluster, clusterName) - machineSet.Metadata().Labels().Set(omni.LabelSkipTeardown, "") spec := machineSet.TypedSpec().Value loadbalancer := omni.NewLoadBalancerStatus(resources.DefaultNamespace, clusterName) @@ -181,6 +181,7 @@ func (suite *MachineSetStatusSuite) createMachineSetWithOpts(clusterName string, func (suite *MachineSetStatusSuite) updateStage(nodes []string, stage specs.ClusterMachineStatusSpec_Stage, ready bool) { for _, node := range nodes { cms := omni.NewClusterMachineStatus(resources.DefaultNamespace, node) + cms.Metadata().Labels().Set(omni.MachineStatusLabelConnected, "") spec := cms.TypedSpec().Value spec.Ready = ready @@ -194,14 +195,14 @@ func (suite *MachineSetStatusSuite) updateStage(nodes []string, stage specs.Clus )) suite.Require().NoError(err) - omnictrl.CopyLabels(machine, cms, omni.LabelControlPlaneRole, omni.LabelCluster, omni.LabelMachineSet, 
omni.LabelWorkerRole) + helpers.CopyLabels(machine, cms, omni.LabelControlPlaneRole, omni.LabelCluster, omni.LabelMachineSet, omni.LabelWorkerRole) err = suite.state.Create(suite.ctx, cms) if state.IsConflictError(err) { _, err = safe.StateUpdateWithConflicts(suite.ctx, suite.state, cms.Metadata(), func(res *omni.ClusterMachineStatus) error { res.TypedSpec().Value = cms.TypedSpec().Value - omnictrl.CopyLabels(machine, res, omni.LabelControlPlaneRole, omni.LabelCluster, omni.LabelMachineSet, omni.LabelWorkerRole) + helpers.CopyLabels(machine, res, omni.LabelControlPlaneRole, omni.LabelCluster, omni.LabelMachineSet, omni.LabelWorkerRole) return nil }) @@ -225,7 +226,7 @@ func (suite *MachineSetStatusSuite) syncConfig(nodes []string) { spec := cmcs.TypedSpec().Value spec.ClusterMachineVersion = machine.Metadata().Version().String() - omnictrl.CopyLabels(machine, cmcs, omni.LabelControlPlaneRole, omni.LabelCluster, omni.LabelMachineSet, omni.LabelWorkerRole) + helpers.CopyLabels(machine, cmcs, omni.LabelControlPlaneRole, omni.LabelCluster, omni.LabelMachineSet, omni.LabelWorkerRole) err = suite.state.Create(suite.ctx, cmcs) if state.IsConflictError(err) { @@ -233,7 +234,7 @@ func (suite *MachineSetStatusSuite) syncConfig(nodes []string) { res.TypedSpec().Value = cmcs.TypedSpec().Value res.TypedSpec().Value.ClusterMachineVersion = machine.Metadata().Version().String() - omnictrl.CopyLabels(machine, res, omni.LabelControlPlaneRole, omni.LabelCluster, omni.LabelMachineSet, omni.LabelWorkerRole) + helpers.CopyLabels(machine, res, omni.LabelControlPlaneRole, omni.LabelCluster, omni.LabelMachineSet, omni.LabelWorkerRole) return nil }) @@ -248,7 +249,7 @@ func (suite *MachineSetStatusSuite) SetupTest() { suite.startRuntime() - suite.Require().NoError(suite.runtime.RegisterController(&omnictrl.MachineSetStatusController{})) + suite.Require().NoError(suite.runtime.RegisterQController(omnictrl.NewMachineSetStatusController())) // create siderolink config as it's endpoint is used 
while generating kubernetes endpoint siderolink := siderolink.NewConfig(resources.DefaultNamespace) @@ -292,15 +293,12 @@ func (suite *MachineSetStatusSuite) TestScaleDown() { "scaledown-3", } - // We create machine with "Healthy=false" status, so that state.Destroy will not work. + // We create machine with "Healthy=false" status, so that rtestutils.Destroy will not work. machineSet := suite.createMachineSetWithOpts(clusterName, "machine-set-scale-down", machines, withHealthy(false)) suite.assertMachinesState(machines, clusterName, machineSet.Metadata().ID()) - suite.Require().NoError(suite.state.Destroy( - suite.ctx, - resource.NewMetadata(resources.DefaultNamespace, omni.MachineSetNodeType, machines[0], resource.VersionUndefined), - )) + rtestutils.Destroy[*omni.MachineSetNode](suite.ctx, suite.T(), suite.state, []string{machines[0]}) suite.assertMachineSetPhase(machineSet, specs.MachineSetPhase_ScalingUp) @@ -308,6 +306,8 @@ func (suite *MachineSetStatusSuite) TestScaleDown() { suite.assertMachinesState(expectedMachines, clusterName, machineSet.Metadata().ID()) + suite.updateStage(expectedMachines, specs.ClusterMachineStatusSpec_RUNNING, true) + loadbalancer := omni.NewLoadBalancerStatus(resources.DefaultNamespace, clusterName) _, err := safe.StateUpdateWithConflicts( suite.ctx, @@ -321,17 +321,10 @@ func (suite *MachineSetStatusSuite) TestScaleDown() { ) suite.Require().NoError(err) - suite.updateStage(expectedMachines, specs.ClusterMachineStatusSpec_RUNNING, true) - - suite.assertMachineSetPhase(machineSet, specs.MachineSetPhase_Running) - suite.Assert().NoError(retry.Constant(5*time.Second, retry.WithUnits(100*time.Millisecond)).Retry( suite.assertNoResource(*omni.NewClusterMachine(resources.DefaultNamespace, expectedMachines[0]).Metadata()), )) - // this should be destroyed by the cluster machine status controller, simulate it here - suite.Assert().NoError(suite.state.Destroy(suite.ctx, omni.NewClusterMachineStatus(resources.DefaultNamespace, 
expectedMachines[0]).Metadata())) - suite.assertMachineSetPhase(machineSet, specs.MachineSetPhase_Running) } @@ -376,7 +369,7 @@ func (suite *MachineSetStatusSuite) TestScaleDownWithMaxParallelism() { suite.Require().NoError(err) for _, machine := range machines[1:] { - suite.Require().NoError(suite.state.Destroy(suite.ctx, omni.NewMachineSetNode(resources.DefaultNamespace, machine, machineSet).Metadata())) + rtestutils.Destroy[*omni.MachineSetNode](suite.ctx, suite.T(), suite.state, []string{machine}) } getTearingDownClusterMachines := func() []*omni.ClusterMachine { @@ -453,11 +446,11 @@ func (suite *MachineSetStatusSuite) TestConfigUpdate() { "patched3", } - machineSet := suite.createMachineSet(clusterName, "machine-set-scale-up", machines, `machine: + machineSet := suite.createMachineSet(clusterName, "machine-set-configs-update", machines, `machine: install: disk: /dev/vdb`) - // initially, each machine must have 2 config patches + // initially, each machine should have 2 config patches for _, m := range machines { assertResource( &suite.OmniSuite, @@ -494,6 +487,20 @@ func (suite *MachineSetStatusSuite) TestConfigUpdate() { network: hostname: the-running-node-cluster-machine-patch` + rtestutils.AssertResource[*omni.MachineSetStatus](suite.ctx, suite.T(), suite.state, machineSet.Metadata().ID(), func(r *omni.MachineSetStatus, assertion *assert.Assertions) { + assertion.True( + r.TypedSpec().Value.Machines.EqualVT( + &specs.Machines{ + Total: 3, + Healthy: 2, + Connected: 2, + Requested: 3, + }, + ), + "status %#v", r.TypedSpec().Value.Machines, + ) + }) + suite.Assert().NoError(suite.state.Create(suite.ctx, machinePatch)) // create a Machine-level patch for the running machine[0] @@ -603,6 +610,12 @@ func (suite *MachineSetStatusSuite) TestConfigUpdateWithMaxParallelism() { suite.Require().NoError(suite.state.Create(suite.ctx, machineSetPatch)) + rtestutils.AssertResources[*omni.ClusterMachine](suite.ctx, suite.T(), suite.state, machines, func(r 
*omni.ClusterMachine, assert *assert.Assertions) { + _, ok := r.Metadata().Annotations().Get(helpers.InputResourceVersionAnnotation) + + assert.True(ok) + }) + expectEvents := func(num int) []resource.ID { ids := make([]resource.ID, 0, num) @@ -695,10 +708,7 @@ func (suite *MachineSetStatusSuite) TestTeardown() { "test", )) - suite.Require().NoError(suite.state.Destroy( - suite.ctx, - resource.NewMetadata(resources.DefaultNamespace, omni.MachineSetNodeType, machines[0], resource.VersionUndefined), - )) + rtestutils.Destroy[*omni.MachineSetNode](suite.ctx, suite.T(), suite.state, []string{machines[0]}) suite.assertMachineSetPhase(machineSet, specs.MachineSetPhase_ScalingDown) @@ -741,13 +751,7 @@ func (suite *MachineSetStatusSuite) TestTeardown() { )) suite.Assert().NoError(suite.state.Destroy(suite.ctx, machineSet.Metadata())) - for _, machine := range machines { - if err := suite.state.Destroy(suite.ctx, - resource.NewMetadata(resources.DefaultNamespace, omni.MachineSetNodeType, machine, resource.VersionUndefined), - ); err != nil && !state.IsNotFoundError(err) { - suite.Require().NoError(err) - } - } + rtestutils.Destroy[*omni.MachineSetNode](suite.ctx, suite.T(), suite.state, machines) }) { break } @@ -780,6 +784,20 @@ func (suite *MachineSetStatusSuite) TestMachineLocks() { suite.Require().NoError(err) + rtestutils.AssertResource[*omni.MachineSetStatus](suite.ctx, suite.T(), suite.state, machineSet.Metadata().ID(), func(r *omni.MachineSetStatus, assertion *assert.Assertions) { + assertion.True( + r.TypedSpec().Value.Machines.EqualVT( + &specs.Machines{ + Total: 3, + Healthy: 3, + Connected: 3, + Requested: 3, + }, + ), + "status %#v", r.TypedSpec().Value.Machines, + ) + }) + patch := omni.NewConfigPatch( resources.DefaultNamespace, machineSet.Metadata().ID()+"-patch", diff --git a/internal/backend/runtime/omni/controllers/omni/machine_status.go b/internal/backend/runtime/omni/controllers/omni/machine_status.go index 521a5741b..593710fe2 100644 --- 
a/internal/backend/runtime/omni/controllers/omni/machine_status.go +++ b/internal/backend/runtime/omni/controllers/omni/machine_status.go @@ -20,6 +20,7 @@ import ( "github.com/siderolabs/omni/client/api/omni/specs" "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/task" "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/task/machine" ) @@ -244,7 +245,7 @@ func (ctrl *MachineStatusController) reconcileCollectors(ctx context.Context, r } } - CopyUserLabels(m, ctrl.mergeLabels(m, machineLabels[m.Metadata().ID()])) + helpers.CopyUserLabels(m, ctrl.mergeLabels(m, machineLabels[m.Metadata().ID()])) omni.MachineStatusReconcileLabels(m) diff --git a/internal/backend/runtime/omni/controllers/omni/omni.go b/internal/backend/runtime/omni/controllers/omni/omni.go index 696220011..f409f9b12 100644 --- a/internal/backend/runtime/omni/controllers/omni/omni.go +++ b/internal/backend/runtime/omni/controllers/omni/omni.go @@ -8,57 +8,16 @@ package omni import ( "context" - "crypto/sha256" - "encoding/hex" "fmt" - "strings" "github.com/cosi-project/runtime/pkg/controller" "github.com/cosi-project/runtime/pkg/controller/generic" "github.com/cosi-project/runtime/pkg/controller/generic/cleanup" "github.com/cosi-project/runtime/pkg/resource" - "github.com/cosi-project/runtime/pkg/resource/kvutils" "github.com/cosi-project/runtime/pkg/state" - "github.com/siderolabs/gen/xslices" "go.uber.org/zap" - - "github.com/siderolabs/omni/client/pkg/omni/resources/omni" ) -const inputResourceVersionAnnotation = "inputResourceVersion" - -// UpdateInputsVersions generates a hash of the resource by combining its inputs. 
-func UpdateInputsVersions[T resource.Resource](out resource.Resource, inputs ...T) bool { - return UpdateInputsAnnotation(out, xslices.Map(inputs, func(input T) string { - return fmt.Sprintf("%s/%s@%s", input.Metadata().Type(), input.Metadata().ID(), input.Metadata().Version()) - })...) -} - -// UpdateInputsAnnotation updates the annotation with the input resource version and returns if it has changed. -func UpdateInputsAnnotation(out resource.Resource, versions ...string) bool { - hash := sha256.New() - - for i, version := range versions { - if i > 0 { - hash.Write([]byte(",")) - } - - hash.Write([]byte(version)) - } - - inVersion := hex.EncodeToString(hash.Sum(nil)) - - version, found := out.Metadata().Annotations().Get(inputResourceVersionAnnotation) - - if found && version == inVersion { - return false - } - - out.Metadata().Annotations().Set(inputResourceVersionAnnotation, inVersion) - - return true -} - func trackResource(r controller.ReaderWriter, ns resource.Namespace, resourceType resource.Type, listOptions ...state.ListOption) *resourceTracker { return &resourceTracker{ touched: map[string]struct{}{}, @@ -138,80 +97,6 @@ func (rt *resourceTracker) BeforeDestroy(f func(res resource.Resource) error) { rt.beforeDestroyCallback = f } -// CopyAllLabels copies all labels from one resource to another. -func CopyAllLabels(src, dst resource.Resource) { - dst.Metadata().Labels().Do(func(tmp kvutils.TempKV) { - for key, value := range src.Metadata().Labels().Raw() { - tmp.Set(key, value) - } - }) -} - -// CopyLabels copies the labels from one resource to another. -func CopyLabels(src, dst resource.Resource, keys ...string) { - dst.Metadata().Labels().Do(func(tmp kvutils.TempKV) { - for _, key := range keys { - if label, ok := src.Metadata().Labels().Get(key); ok { - tmp.Set(key, label) - } - } - }) -} - -// CopyAllAnnotations copies all annotations from one resource to another. 
-func CopyAllAnnotations(src, dst resource.Resource) { - dst.Metadata().Annotations().Do(func(tmp kvutils.TempKV) { - for key, value := range src.Metadata().Annotations().Raw() { - tmp.Set(key, value) - } - }) -} - -// CopyAnnotations copies annotations from one resource to another. -func CopyAnnotations(src, dst resource.Resource, annotations ...string) { - dst.Metadata().Annotations().Do(func(tmp kvutils.TempKV) { - for _, key := range annotations { - if label, ok := src.Metadata().Annotations().Get(key); ok { - tmp.Set(key, label) - } - } - }) -} - -// CopyUserLabels copies all user labels from one resource to another. -// It removes all user labels on the target that are not present in the source resource. -// System labels are not copied. -func CopyUserLabels(target resource.Resource, labels map[string]string) { - ClearUserLabels(target) - - if len(labels) == 0 { - return - } - - target.Metadata().Labels().Do(func(tmp kvutils.TempKV) { - for key, value := range labels { - if strings.HasPrefix(key, omni.SystemLabelPrefix) { - continue - } - - tmp.Set(key, value) - } - }) -} - -// ClearUserLabels removes all user labels from the resource. -func ClearUserLabels(res resource.Resource) { - res.Metadata().Labels().Do(func(tmp kvutils.TempKV) { - for key := range res.Metadata().Labels().Raw() { - if strings.HasPrefix(key, omni.SystemLabelPrefix) { - continue - } - - tmp.Delete(key) - } - }) -} - // withFinalizerCheck wraps a [cleanup.Handler] with a check that needs to pass before the handler is called. 
func withFinalizerCheck[Input generic.ResourceWithRD](handler cleanup.Handler[Input], check func(input Input) error) cleanup.Handler[Input] { return &cleanupChecker[Input]{ diff --git a/internal/backend/runtime/omni/controllers/omni/omni_test.go b/internal/backend/runtime/omni/controllers/omni/omni_test.go index dc5020f97..b9e9f399c 100644 --- a/internal/backend/runtime/omni/controllers/omni/omni_test.go +++ b/internal/backend/runtime/omni/controllers/omni/omni_test.go @@ -16,7 +16,6 @@ import ( "slices" "sync" "sync/atomic" - "testing" "time" cosiv1alpha1 "github.com/cosi-project/runtime/api/v1alpha1" @@ -45,7 +44,6 @@ import ( "github.com/siderolabs/omni/client/pkg/omni/resources/omni" rt "github.com/siderolabs/omni/internal/backend/runtime" "github.com/siderolabs/omni/internal/backend/runtime/kubernetes" - omnictrl "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni" ) const ( @@ -568,26 +566,6 @@ func (suite *OmniSuite) destroyClusterByID(clusterID string) { assertNoResource[*omni.EtcdBackupStatus](suite, omni.NewEtcdBackupStatus(clusterID)) } -func TestUpdateInputsVersions(t *testing.T) { - out := omni.NewCluster("default", "test") - - in := []resource.Resource{omni.NewMachine("default", "test1"), omni.NewMachine("default", "test2")} - - assert.True(t, omnictrl.UpdateInputsVersions(out, in...)) - - v, _ := out.Metadata().Annotations().Get("inputResourceVersion") - assert.Equal(t, "a7a451e614fc3b4a7241798235001fea271c7ad5493c392f0a012104379bdb89", v) - - assert.False(t, omnictrl.UpdateInputsVersions(out, in...)) - - in = append(in, omni.NewClusterMachine("default", "cm1")) - - assert.True(t, omnictrl.UpdateInputsVersions(out, in...)) - - v, _ = out.Metadata().Annotations().Get("inputResourceVersion") - assert.Equal(t, "df4af53c3caf7ae4c0446bcf8b854ed3f5740a47eab0e5151f1962a4a4d52f6f", v) -} - type dynamicStateBuilder struct { //nolint:govet mx sync.Mutex m map[resource.Namespace]state.CoreState diff --git 
a/internal/backend/runtime/omni/controllers/omni/talos_upgrade_status.go b/internal/backend/runtime/omni/controllers/omni/talos_upgrade_status.go index c1853bbdf..0fb479930 100644 --- a/internal/backend/runtime/omni/controllers/omni/talos_upgrade_status.go +++ b/internal/backend/runtime/omni/controllers/omni/talos_upgrade_status.go @@ -23,6 +23,7 @@ import ( "github.com/siderolabs/omni/client/api/omni/specs" "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/mappers" "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni/internal/talos" ) @@ -383,6 +384,7 @@ func reconcileTalosUpdateStatus(ctx context.Context, r controller.ReaderWriter, if versionMismatch { if upgradeStatus.TypedSpec().Value.Phase == specs.TalosUpgradeStatusSpec_Upgrading && upgradeStatus.TypedSpec().Value.Status == "" { + upgradeStatus.TypedSpec().Value.Step = "update paused" upgradeStatus.TypedSpec().Value.Status = "waiting for the cluster to be ready" } @@ -475,7 +477,7 @@ func updateMachine(ctx context.Context, r controller.ReaderWriter, logger *zap.L func createInitialTalosVersion(ctx context.Context, r controller.ReaderWriter, machine *omni.ClusterMachine, talosVersion, schematicID string) error { res := omni.NewClusterMachineTalosVersion(resources.DefaultNamespace, machine.Metadata().ID()) - CopyAllLabels(machine, res) + helpers.CopyAllLabels(machine, res) res.TypedSpec().Value.TalosVersion = talosVersion res.TypedSpec().Value.SchematicId = schematicID diff --git a/internal/backend/runtime/omni/controllers/omni/talos_upgrade_status_test.go b/internal/backend/runtime/omni/controllers/omni/talos_upgrade_status_test.go index 7921b1df4..c4a53f7dd 100644 --- a/internal/backend/runtime/omni/controllers/omni/talos_upgrade_status_test.go +++ 
b/internal/backend/runtime/omni/controllers/omni/talos_upgrade_status_test.go @@ -19,6 +19,7 @@ import ( "github.com/siderolabs/omni/client/api/omni/specs" "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" omnictrl "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni" ) @@ -29,9 +30,6 @@ type TalosUpgradeStatusSuite struct { func (suite *TalosUpgradeStatusSuite) TestReconcile() { suite.startRuntime() - suite.Require().NoError(suite.runtime.RegisterQController(omnictrl.NewTalosUpgradeStatusController())) - suite.Require().NoError(suite.runtime.RegisterController(&omnictrl.MachineSetStatusController{})) - clusterName := "talos-upgrade-cluster" cluster, machines := suite.createCluster(clusterName, 3, 1) @@ -43,6 +41,8 @@ func (suite *TalosUpgradeStatusSuite) TestReconcile() { suite.Require().NoError(suite.state.Create(suite.ctx, clusterStatus)) + suite.Require().NoError(suite.runtime.RegisterQController(omnictrl.NewTalosUpgradeStatusController())) + for _, res := range machines { assertResource( &suite.OmniSuite, @@ -55,8 +55,12 @@ func (suite *TalosUpgradeStatusSuite) TestReconcile() { ) configStatus := omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, res.Metadata().ID()) + + helpers.CopyAllLabels(res, configStatus) + configStatus.TypedSpec().Value.ClusterMachineConfigSha256 = "aaaa" configStatus.TypedSpec().Value.TalosVersion = cluster.TypedSpec().Value.TalosVersion + configStatus.TypedSpec().Value.SchematicId = defaultSchematic suite.Require().NoError(suite.state.Create(suite.ctx, configStatus)) } @@ -199,7 +203,7 @@ func (suite *TalosUpgradeStatusSuite) TestUpdateVersionsMaintenance() { suite.startRuntime() suite.Require().NoError(suite.runtime.RegisterQController(omnictrl.NewTalosUpgradeStatusController())) - 
suite.Require().NoError(suite.runtime.RegisterController(&omnictrl.MachineSetStatusController{})) + suite.Require().NoError(suite.runtime.RegisterQController(omnictrl.NewMachineSetStatusController())) clusterName := "talos-upgrade-cluster" diff --git a/internal/backend/runtime/omni/controllers/omni/talosconfig.go b/internal/backend/runtime/omni/controllers/omni/talosconfig.go index 975a28284..d5e95cab9 100644 --- a/internal/backend/runtime/omni/controllers/omni/talosconfig.go +++ b/internal/backend/runtime/omni/controllers/omni/talosconfig.go @@ -21,6 +21,7 @@ import ( "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" "github.com/siderolabs/omni/client/pkg/omni/resources/system" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" "github.com/siderolabs/omni/internal/pkg/certs" ) @@ -47,7 +48,7 @@ func NewTalosConfigController(certificateValidity time.Duration) *TalosConfigCon } // should always call UpdateInputsVersions to update the annotations, due to short-circuiting - if !UpdateInputsVersions(talosConfig, secrets) && !staleCertificate { + if !helpers.UpdateInputsVersions(talosConfig, secrets) && !staleCertificate { return nil } diff --git a/internal/backend/runtime/omni/migration/helpers.go b/internal/backend/runtime/omni/migration/helpers.go index a4eb56002..bc4765cbf 100644 --- a/internal/backend/runtime/omni/migration/helpers.go +++ b/internal/backend/runtime/omni/migration/helpers.go @@ -14,6 +14,7 @@ import ( "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" omnictrl "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni" ) @@ -63,7 +64,7 @@ func reconcileConfigInputs(ctx context.Context, st state.State, item *omni.Clust } _, err = safe.StateUpdateWithConflicts(ctx, st, config.Metadata(), 
func(machineConfig *omni.ClusterMachineConfig) error { - omnictrl.UpdateInputsVersions(machineConfig, inputs...) + helpers.UpdateInputsVersions(machineConfig, inputs...) machineConfig.TypedSpec().Value.ClusterMachineVersion = item.Metadata().Version().String() diff --git a/internal/backend/runtime/omni/migration/migrations.go b/internal/backend/runtime/omni/migration/migrations.go index e78f2d527..45f9a7294 100644 --- a/internal/backend/runtime/omni/migration/migrations.go +++ b/internal/backend/runtime/omni/migration/migrations.go @@ -26,6 +26,7 @@ import ( "github.com/siderolabs/omni/client/pkg/omni/resources/omni" "github.com/siderolabs/omni/client/pkg/omni/resources/registry" "github.com/siderolabs/omni/client/pkg/omni/resources/siderolink" + "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/helpers" omnictrl "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni" "github.com/siderolabs/omni/internal/pkg/auth/role" "github.com/siderolabs/omni/internal/pkg/auth/scope" @@ -99,7 +100,7 @@ func deprecateClusterMachineTemplates(ctx context.Context, s state.State, _ *zap pair.MakePair("machine-uuid", iter.Value().Metadata().ID()), ) - omnictrl.CopyLabels(iter.Value(), patch, deprecatedCluster) + helpers.CopyLabels(iter.Value(), patch, deprecatedCluster) if err = createOrUpdate(ctx, s, patch, func(p *omni.ConfigPatch) error { p.TypedSpec().Value.Data = item.Value.Patch @@ -117,7 +118,7 @@ func deprecateClusterMachineTemplates(ctx context.Context, s state.State, _ *zap pair.MakePair("machine-uuid", iter.Value().Metadata().ID()), ) - omnictrl.CopyLabels(iter.Value(), patch, deprecatedCluster) + helpers.CopyLabels(iter.Value(), patch, deprecatedCluster) if err = createOrUpdate(ctx, s, patch, func(p *omni.ConfigPatch) error { var config struct { @@ -178,7 +179,7 @@ func clusterMachinesToMachineSets(ctx context.Context, s state.State, logger *za if _, ok = machineSets[machineSetID]; !ok { machineSets[machineSetID] = 
omni.NewMachineSet(resources.DefaultNamespace, machineSetID) - omnictrl.CopyLabels(item, machineSets[machineSetID], deprecatedCluster, deprecatedWorkerRole, deprecatedControlPlaneRole) + helpers.CopyLabels(item, machineSets[machineSetID], deprecatedCluster, deprecatedWorkerRole, deprecatedControlPlaneRole) } var patches []*omni.ConfigPatch @@ -191,9 +192,11 @@ func clusterMachinesToMachineSets(ctx context.Context, s state.State, logger *za _, err = safe.StateUpdateWithConflicts(ctx, s, item.Metadata(), func(res *omni.ClusterMachine) error { res.Metadata().Labels().Set("machine-set", machineSetID) - omnictrl.UpdateInputsVersions(res, patches...) + helpers.UpdateInputsVersions(res, patches...) - return res.Metadata().SetOwner((&omnictrl.MachineSetStatusController{}).Name()) + owner := omnictrl.NewMachineSetController().ControllerName + + return res.Metadata().SetOwner(owner) }, state.WithExpectedPhaseAny(), state.WithUpdateOwner(item.Metadata().Owner())) if err != nil { return err @@ -201,7 +204,7 @@ func clusterMachinesToMachineSets(ctx context.Context, s state.State, logger *za machineSetNode := omni.NewMachineSetNode(resources.DefaultNamespace, item.Metadata().ID(), machineSets[machineSetID]) if err = createOrUpdate(ctx, s, machineSetNode, func(res *omni.MachineSetNode) error { - omnictrl.CopyLabels(machineSetNode, res, deprecatedCluster, deprecatedWorkerRole) + helpers.CopyLabels(machineSetNode, res, deprecatedCluster, deprecatedWorkerRole) res.TypedSpec().Value = machineSetNode.TypedSpec().Value @@ -213,7 +216,7 @@ func clusterMachinesToMachineSets(ctx context.Context, s state.State, logger *za for _, ms := range machineSets { if err = createOrUpdate(ctx, s, ms, func(res *omni.MachineSet) error { - omnictrl.CopyLabels(ms, res, deprecatedCluster, deprecatedWorkerRole) + helpers.CopyLabels(ms, res, deprecatedCluster, deprecatedWorkerRole) res.TypedSpec().Value = ms.TypedSpec().Value @@ -808,7 +811,7 @@ func lowercaseAllIdentities(ctx context.Context, st 
state.State, _ *zap.Logger) switch { case existing == nil: res := auth.NewIdentity(identity.Metadata().Namespace(), lowercase) - omnictrl.CopyAllLabels(identity, res) + helpers.CopyAllLabels(identity, res) res.TypedSpec().Value = identity.TypedSpec().Value @@ -818,7 +821,7 @@ func lowercaseAllIdentities(ctx context.Context, st state.State, _ *zap.Logger) case existing.Metadata().Created().After(identity.Metadata().Created()): default: if _, err = safe.StateUpdateWithConflicts(ctx, st, existing.Metadata(), func(res *auth.Identity) error { - omnictrl.CopyAllLabels(identity, res) + helpers.CopyAllLabels(identity, res) res.TypedSpec().Value = identity.TypedSpec().Value @@ -879,11 +882,11 @@ func removeConfigPatchesFromClusterMachines(ctx context.Context, st state.State, } return items.ForEachErr(func(item *omni.ClusterMachine) error { - owner := (&omnictrl.MachineSetStatusController{}).Name() + owner := omnictrl.NewMachineSetController().ControllerName err = createOrUpdate(ctx, st, omni.NewClusterMachineConfigPatches(item.Metadata().Namespace(), item.Metadata().ID()), func(res *omni.ClusterMachineConfigPatches) error { - omnictrl.CopyAllLabels(item, res) + helpers.CopyAllLabels(item, res) machineSet, ok := item.Metadata().Labels().Get(omni.SystemLabelPrefix + "machine-set") if !ok { diff --git a/internal/backend/runtime/omni/omni.go b/internal/backend/runtime/omni/omni.go index 7a1350293..6604a92f0 100644 --- a/internal/backend/runtime/omni/omni.go +++ b/internal/backend/runtime/omni/omni.go @@ -70,6 +70,8 @@ type Runtime struct { } // New creates a new Omni runtime. 
+// +//nolint:maintidx func New(talosClientFactory *talos.ClientFactory, dnsService *dns.Service, workloadProxyServiceRegistry *workloadproxy.ServiceRegistry, resourceLogger *resourcelogger.Logger, linkCounterDeltaCh <-chan siderolink.LinkCounterDeltas, resourceState state.State, virtualState *virtual.State, metricsRegistry prometheus.Registerer, logger *zap.Logger, @@ -166,7 +168,6 @@ func New(talosClientFactory *talos.ClientFactory, dnsService *dns.Service, workl &omnictrl.LoadBalancerController{}, &omnictrl.MachineSetNodeController{}, &omnictrl.MachineSetDestroyStatusController{}, - &omnictrl.MachineSetStatusController{}, &omnictrl.MachineStatusController{}, omnictrl.NewMachineCleanupController(), omnictrl.NewMachineStatusLinkController(linkCounterDeltaCh), @@ -191,6 +192,7 @@ func New(talosClientFactory *talos.ClientFactory, dnsService *dns.Service, workl omnictrl.NewClusterConfigVersionController(), omnictrl.NewClusterEndpointController(), omnictrl.NewClusterMachineConfigController(config.Config.DefaultConfigGenOptions), + omnictrl.NewClusterMachineTeardownController(), omnictrl.NewMachineConfigGenOptionsController(), omnictrl.NewClusterMachineConfigStatusController(), omnictrl.NewClusterMachineEncryptionKeyController(), @@ -203,6 +205,7 @@ func New(talosClientFactory *talos.ClientFactory, dnsService *dns.Service, workl omnictrl.NewKubernetesUpgradeManifestStatusController(), omnictrl.NewKubernetesUpgradeStatusController(), omnictrl.NewMachineController(), + omnictrl.NewMachineSetStatusController(), omnictrl.NewMachineSetEtcdAuditController(talosClientFactory, time.Minute), omnictrl.NewRedactedClusterMachineConfigController(), omnictrl.NewSecretsController(storeFactory), diff --git a/internal/frontend/frontend.go b/internal/frontend/frontend.go index 79060b51a..98b4939ba 100644 --- a/internal/frontend/frontend.go +++ b/internal/frontend/frontend.go @@ -5,7 +5,7 @@ // THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. 
// -// Generated on 2024-02-28T11:43:22Z by kres latest. +// Generated on 2024-02-28T17:18:34Z by kres latest. package frontend