From d2e03095c62c6983db82f1228b74886bda6b9be2 Mon Sep 17 00:00:00 2001 From: Pedro Soares Date: Fri, 15 Mar 2024 10:11:11 -0300 Subject: [PATCH] fix(watcher): reset mitigation if total rooms is 1 If the total rooms is 1, we need to allow disruptions in the runtime. Thus, we reset the mitigation to be applied in 0 rooms instead of 1. --- docs/reference/Kubernetes.md | 3 +++ .../adapters/runtime/kubernetes/scheduler.go | 7 ----- .../runtime/kubernetes/scheduler_test.go | 26 +++++-------------- .../runtimewatcher/runtime_watcher_worker.go | 20 +++++++++++--- .../runtime_watcher_worker_test.go | 1 + 5 files changed, 27 insertions(+), 30 deletions(-) diff --git a/docs/reference/Kubernetes.md b/docs/reference/Kubernetes.md index 0fd7caaf4..81f0b83dd 100644 --- a/docs/reference/Kubernetes.md +++ b/docs/reference/Kubernetes.md @@ -58,6 +58,9 @@ runtimeWatcher: safetyPercentage: 0.05 ``` +We should always allow node consolidation to happen, thus if the total number of rooms +is 1, then we set PDB's `minAvailable: 0`. 
+ #### Pod Change Events Worker For this type of worker, runtime watcher spawns multiple goroutines, maintaining a worker diff --git a/internal/adapters/runtime/kubernetes/scheduler.go b/internal/adapters/runtime/kubernetes/scheduler.go index deb5e4914..e7ed7f4f1 100644 --- a/internal/adapters/runtime/kubernetes/scheduler.go +++ b/internal/adapters/runtime/kubernetes/scheduler.go @@ -108,13 +108,6 @@ func (k *kubernetes) createPDBFromScheduler(ctx context.Context, scheduler *enti }, } - if scheduler.Autoscaling != nil && scheduler.Autoscaling.Enabled { - pdbSpec.Spec.MinAvailable = &intstr.IntOrString{ - Type: intstr.Int, - IntVal: int32(scheduler.Autoscaling.Min), - } - } - pdb, err := k.clientSet.PolicyV1().PodDisruptionBudgets(scheduler.Name).Create(ctx, pdbSpec, metav1.CreateOptions{}) if err != nil && !kerrors.IsAlreadyExists(err) { k.logger.Warn("error creating pdb", zap.String("scheduler", scheduler.Name), zap.Error(err)) diff --git a/internal/adapters/runtime/kubernetes/scheduler_test.go b/internal/adapters/runtime/kubernetes/scheduler_test.go index 79c1a0fc2..0d223c2c4 100644 --- a/internal/adapters/runtime/kubernetes/scheduler_test.go +++ b/internal/adapters/runtime/kubernetes/scheduler_test.go @@ -128,7 +128,7 @@ func TestPDBCreationAndDeletion(t *testing.T) { require.Contains(t, pdb.Labels["app.kubernetes.io/managed-by"], "maestro") }) - t.Run("create pdb from scheduler with autoscaling", func(t *testing.T) { + t.Run("pdb should not use scheduler min as minAvailable", func(t *testing.T) { if !kubernetesRuntime.isPDBSupported() { t.Log("Kubernetes version does not support PDB, skipping") t.SkipNow() @@ -166,7 +166,7 @@ func TestPDBCreationAndDeletion(t *testing.T) { require.NoError(t, err) require.NotNil(t, pdb) require.Equal(t, pdb.Name, scheduler.Name) - require.Equal(t, pdb.Spec.MinAvailable.IntVal, int32(2)) + require.Equal(t, pdb.Spec.MinAvailable.IntVal, int32(0)) }) t.Run("delete pdb on scheduler deletion", func(t *testing.T) { @@ -233,7 +233,7 @@ 
func TestMitigateDisruption(t *testing.T) { require.Equal(t, pdb.Spec.MinAvailable.IntVal, int32(0)) }) - t.Run("should update PDB on mitigatation if not equal to current minAvailable", func(t *testing.T) { + t.Run("should update PDB on mitigation if not equal to current value", func(t *testing.T) { if !kubernetesRuntime.isPDBSupported() { t.Log("Kubernetes version does not support PDB, skipping") t.SkipNow() @@ -241,19 +241,6 @@ func TestMitigateDisruption(t *testing.T) { scheduler := &entities.Scheduler{ Name: "scheduler-pdb-mitigation-update", - Autoscaling: &autoscaling.Autoscaling{ - Enabled: true, - Min: 100, - Max: 200, - Policy: autoscaling.Policy{ - Type: autoscaling.RoomOccupancy, - Parameters: autoscaling.PolicyParameters{ - RoomOccupancy: &autoscaling.RoomOccupancyParams{ - ReadyTarget: 0.1, - }, - }, - }, - }, } err := kubernetesRuntime.CreateScheduler(ctx, scheduler) if err != nil { @@ -271,9 +258,10 @@ func TestMitigateDisruption(t *testing.T) { require.NoError(t, err) require.NotNil(t, pdb) require.Equal(t, pdb.Name, scheduler.Name) - require.Equal(t, pdb.Spec.MinAvailable.IntVal, int32(scheduler.Autoscaling.Min)) + require.Equal(t, pdb.Spec.MinAvailable.IntVal, int32(0)) - err = kubernetesRuntime.MitigateDisruption(ctx, scheduler, scheduler.Autoscaling.Min, 0.0) + occupiedRooms := 100 + err = kubernetesRuntime.MitigateDisruption(ctx, scheduler, occupiedRooms, 0.0) require.NoError(t, err) pdb, err = client.PolicyV1().PodDisruptionBudgets(scheduler.Name).Get(ctx, scheduler.Name, metav1.GetOptions{}) @@ -282,7 +270,7 @@ func TestMitigateDisruption(t *testing.T) { require.Equal(t, pdb.Name, scheduler.Name) incSafetyPercentage := 1.0 + DefaultDisruptionSafetyPercentage - newRoomAmount := int32(float64(scheduler.Autoscaling.Min) * incSafetyPercentage) + newRoomAmount := int32(float64(occupiedRooms) * incSafetyPercentage) require.Equal(t, pdb.Spec.MinAvailable.IntVal, newRoomAmount) }) diff --git a/internal/core/worker/runtimewatcher/runtime_watcher_worker.go 
b/internal/core/worker/runtimewatcher/runtime_watcher_worker.go index 95c903d40..c32dc4f25 100644 --- a/internal/core/worker/runtimewatcher/runtime_watcher_worker.go +++ b/internal/core/worker/runtimewatcher/runtime_watcher_worker.go @@ -101,16 +101,28 @@ func (w *runtimeWatcherWorker) spawnUpdateRoomWatchers(resultChan chan game_room } func (w *runtimeWatcherWorker) mitigateDisruptions() error { - occupiedRoomsAmount, err := w.roomStorage.GetRoomCountByStatus(w.ctx, w.scheduler.Name, game_room.GameStatusOccupied) + totalRoomsAmount, err := w.roomStorage.GetRoomCount(w.ctx, w.scheduler.Name) if err != nil { w.logger.Error( - "failed to get occupied rooms for scheduler", + "failed to get total rooms amount for scheduler", zap.String("scheduler", w.scheduler.Name), zap.Error(err), ) return err } - err = w.runtime.MitigateDisruption(w.ctx, w.scheduler, occupiedRoomsAmount, w.config.DisruptionSafetyPercentage) + mitigateForRoomsAmount := 0 + if totalRoomsAmount >= 2 { + mitigateForRoomsAmount, err = w.roomStorage.GetRoomCountByStatus(w.ctx, w.scheduler.Name, game_room.GameStatusOccupied) + if err != nil { + w.logger.Error( + "failed to get occupied rooms for scheduler", + zap.String("scheduler", w.scheduler.Name), + zap.Error(err), + ) + return err + } + } + err = w.runtime.MitigateDisruption(w.ctx, w.scheduler, mitigateForRoomsAmount, w.config.DisruptionSafetyPercentage) if err != nil { w.logger.Error( "failed to mitigate disruption", @@ -122,7 +134,7 @@ func (w *runtimeWatcherWorker) mitigateDisruptions() error { w.logger.Debug( "mitigated disruption for occupied rooms", zap.String("scheduler", w.scheduler.Name), - zap.Int("occupiedRooms", occupiedRoomsAmount), + zap.Int("mitigateForRoomsAmount", mitigateForRoomsAmount), ) return nil diff --git a/internal/core/worker/runtimewatcher/runtime_watcher_worker_test.go b/internal/core/worker/runtimewatcher/runtime_watcher_worker_test.go index 0c0434405..a04b0fb69 100644 --- 
a/internal/core/worker/runtimewatcher/runtime_watcher_worker_test.go +++ b/internal/core/worker/runtimewatcher/runtime_watcher_worker_test.go @@ -264,6 +264,7 @@ func TestRuntimeWatcher_Start(t *testing.T) { runtimeWatcher.EXPECT().Stop() runtime.EXPECT().MitigateDisruption(gomock.Any(), gomock.Any(), 0, 0.0).Return(nil).MinTimes(0) + roomStorage.EXPECT().GetRoomCount(gomock.Any(), gomock.Any()).Return(2, nil).MinTimes(0) roomStorage.EXPECT().GetRoomCountByStatus(gomock.Any(), gomock.Any(), gomock.Any()).Return(0, nil).MinTimes(0) ctx, cancelFunc := context.WithCancel(context.Background())