diff --git a/docs/reference/RollingUpdate.md b/docs/reference/RollingUpdate.md index 6ff30e368..939eb6ad8 100644 --- a/docs/reference/RollingUpdate.md +++ b/docs/reference/RollingUpdate.md @@ -75,10 +75,10 @@ version in the database that is not from the active scheduler version 6. If it does not have, run normal autoscale. If it has, it is performing a rolling update, proceed with the update -7. The update checks how many rooms it can spawn by computing the below -``` -maxSurge * totalAvailableRooms (pending + unready + ready + occupied) -``` +7. The update checks how many rooms it can spawn by computing the below: + 1. Calculate the current `deviation` (`totalRoomsAmount - desiredAmountOfRooms`); + 2. Calculate the `surge` amount over the desired (`surge = (maxSurge / 100) * desiredAmountOfRooms`); + 3. Discount the deviation from the surge (`surge = surge - deviation`); 8. Enqueues a priority _add_room_ operation to create the surge amount 9. Check how many old rooms it can delete by computing ``` @@ -118,55 +118,28 @@ throughout the cycle and rolling update will adjust to that as well. | **loop** | **ready** | **occupied** | **available** | **desired** | **desiredReady** | **toSurge** | **toBeDeleted** | |----------|-----------|--------------|---------------|-------------|------------------|-------------|-----------------| -| **1** | 20 | 5 | 25 (0 new) | 10 | 5 | 7 | 15 | -| **2** | 12 | 5 | 17 (7 new) | 10 | 5 | 4 | 7 | -| **3** | 9 | 5 | 14 (11 new) | 10 | 5 | 3 | 3 (4 actually) | -| **4** | 9 | 5 | 14 (14 new) | 10 | 5 | - | 4 by autoscale | -| **5** | 5 | 5 | 10 (10 new) | 10 | 5 | - | - | +| **1** | 20 | 5 | 25 (0 new) | 10 | 5 | 0 | 15 | +| **2** | 5 | 5 | 10 (0 new) | 10 | 5 | 2 | 0 | +| **3** | 7 | 5 | 12 (2 new) | 10 | 5 | 0 | 2 | +| **4** | 5 | 5 | 10 (2 new) | 10 | 5 | 2 | 0 | +| **5** | 7 | 5 | 12 (4 new) | 10 | 5 | 0 | 2 | +| **6** | 5 | 5 | 10 (4 new) | 10 | 5 | 2 | 0 | +| **7** | 7 | 5 | 12 (6 new) | 10 | 5 | 0 | 2 | +| **8** | 5 | 5 | 10 (6 new) | 10 | 5 | 2 | 0 | +| **9** | 7 | 5 | 12 (8 new) | 10 | 5 | 0 | 2 | +| **10** | 5 | 5 | 10 (8 new) | 10 | 5 | 2 | 0 | +| **11** | 7 | 5 | 12 (10 new) | 10 | 5 | - | 2 | +| **12** | 5 | 5 | 10 (10 new) | 10 | 5 | - | - | ### Upscale | **loop** | **ready** | **occupied** | **available** | **desired** | **desiredReady** | **toSurge** | **toBeDeleted** | |----------|-----------|--------------|---------------|-------------|------------------|-------------|-----------------| -| **1** | 5 | 20 | 25 (0 new) | 40 | 20 | 7 | 0 | -| **2** | 12 | 20 | 32 (7 new) | 40 | 20 | 8 | 0 | -| **3** | 20 | 20 | 40 (15 new) | 40 | 20 | 10 | 0 | -| **4** | 30 | 20 | 50 (25 new) | 40 | 20 | 13 | 10 | -| **5** | 33 | 20 | 53 (38 new) | 40 | 20 | 14 | 13 | -| **6** | 34 | 20 | 54 (52 new) | 40 | 20 | 14 | 2 (actual 14) | -| **7** | 46 | 20 | 66 (66 new) | 40 | 20 | - | 26 by autoscale | -| **8** | 20 | 20 | 40 (40 new) | 40 | 20 | - | - | - -## Big Amount of Game Rooms - -### Upscale - -* readyTarget: 0.4 -> 0.7 -* maxSurge: 25% - -| **loop** | **ready** | **occupied** | **available** | **desired** | **desiredReady** | **toSurge** | **toBeDeleted** | -|----------|-----------|--------------|-----------------|-------------|------------------|-------------|------------------| -| **1** | 209 | 458 | 667 (0 new) | 1526 | 1068 | 167 | 0 | -| **2** | 376 | 458 | 834 (167 new) | 1526 | 1068 | 209 | 0 | -| **3** | 585 | 458 | 1043 (376 new) | 1526 | 1068 | 261 | 0 | -| **4** | 846 | 458 | 1304 (637 new) | 1526 | 1068 | 326 | 0 | -| **5** | 1172 | 458 | 1630 (963 new) | 1526 | 1068 | 408 | 104 | -| **6** | 1476 | 458 | 1934 (1371 new) | 1526 | 1068 | 484 | 408 | -| **7** | 1552 | 458 | 2010 (1855 new) | 1526 | 1068 | 503 | 155 | -| **8** | 2055 | 458 | 2513 (2513 new) | 1526 | 1068 | - | 987 by autoscale | -| **9** | 1068 | 458 | 1526 (1526 new) | 1526 | 1068 | - | - | - -### Downscale - -* readyTarget: 0.7 -> 0.4 -* maxSurge: 25% - -| **loop** | **ready** | **occupied** | **available** | **desired** | **desiredReady** | **toSurge** | **toBeDeleted** | -|----------|-----------|--------------|-----------------|-------------|------------------|-------------|-------------------| -| **1** | 940 | 1040 | 1980 (0 new) | 1733 | 693 | 495 | 247 | -| **2** | 1188 | 1040 | 2228 (495 new) | 1733 | 693 | 557 | 495 | -| **3** | 1250 | 1040 | 2290 (1052 new) | 1733 | 693 | 573 | 557 | -| **4** | 1266 | 1040 | 2306 (1625 new) | 1733 | 693 | 577 | 573 | -| **5** | 1270 | 1040 | 2310 (2202 new) | 1733 | 693 | 578 | 108 (577) | -| **6** | 1740 | 1040 | 2780 (2780 new) | 1733 | 693 | - | 1047 by autoscale | -| **7** | 693 | 1040 | 1733 (1733 new) | 1733 | 693 | - | - | +| **1** | 5 | 20 | 25 (0 new) | 40 | 20 | 10 | 0 | +| **2** | 15 | 20 | 35 (10 new) | 40 | 20 | 10 | 0 | +| **3** | 25 | 20 | 45 (20 new) | 40 | 20 | 5 | 5 | +| **4** | 25 | 20 | 45 (25 new) | 40 | 20 | 5 | 5 | +| **5** | 25 | 20 | 45 (30 new) | 40 | 20 | 5 | 5 | +| **6** | 35 | 20 | 45 (35 new) | 40 | 20 | 5 | 5 | +| **7** | 25 | 20 | 45 (40 new) | 40 | 20 | 0 | 5 | +| **8** | 20 | 20 | 40 (40 new) | 40 | 20 | 0 | 0 | diff --git a/internal/core/operations/healthcontroller/executor.go b/internal/core/operations/healthcontroller/executor.go index d33509a46..7086b6a47 100644 --- a/internal/core/operations/healthcontroller/executor.go +++ b/internal/core/operations/healthcontroller/executor.go @@ -25,6 +25,9 @@ package healthcontroller import ( "context" "fmt" + "math" + "strconv" + "strings" "time" "github.com/topfreegames/maestro/internal/core/operations/rooms/add" @@ -40,6 +43,10 @@ import ( "go.uber.org/zap" ) +const ( + schedulerMaxSurgeRelativeSymbol = "%" +) + // Config have the configs to execute healthcontroller. type Config struct { RoomInitializationTimeout time.Duration @@ -127,8 +134,13 @@ func (ex *Executor) Execute(ctx context.Context, op *operation.Operation, defini // Check if the system is in a rollingUpdate by listing rooms that are not the current scheduler version roomsPreviousSchedulerVersion, isRollingUpdate := ex.checkRollingUpdate(ctx, logger, scheduler, availableRooms) if isRollingUpdate { - return ex.performRollingUpdate(ctx, op, def, logger, scheduler, desiredNumberOfRooms, availableRooms, roomsPreviousSchedulerVersion) + err = ex.performRollingUpdate(ctx, op, def, logger, scheduler, desiredNumberOfRooms, availableRooms, roomsPreviousSchedulerVersion) + if err != nil { + logger.Error("could not perform rolling update", zap.Error(err)) + // Rolling update should not block the health controller + } } + err = ex.ensureDesiredAmountOfInstances(ctx, op, def, scheduler, logger, len(availableRooms), desiredNumberOfRooms) if err != nil { logger.Error("cannot ensure desired amount of instances", zap.Error(err)) @@ -426,19 +438,18 @@ func (ex *Executor) performRollingUpdate( roomsWithPreviousSchedulerVersion []string, ) error { logger.Info("performing rolling update", zap.String("scheduler.Version", scheduler.Spec.Version)) - maxSurgeAmount, err := ex.roomManager.SchedulerMaxSurge(ctx, scheduler) + maxSurgeAmount, err := ComputeRollingSurge(scheduler, desiredNumberOfRooms, len(availableRoomsIDs)) if err != nil { logger.Error("failed to perform rolling update while getting max surge amount of rooms", zap.Error(err)) return err } + if len(roomsWithPreviousSchedulerVersion) < maxSurgeAmount { maxSurgeAmount = len(roomsWithPreviousSchedulerVersion) } - if maxSurgeAmount <= 0 { - maxSurgeAmount = 1 - } + logger.Info( - "upscaling new rooms", + "up scaling new rooms", zap.Int("desired", desiredNumberOfRooms), zap.Int("maxSurgeAmount", maxSurgeAmount), zap.Int("available", len(availableRoomsIDs)), @@ -521,3 +532,22 @@ func (ex *Executor) markPreviousSchedulerRoomsForDeletion( logger.Sugar().Infof("successfully marked %d rooms for deletion", bufferRoomsToBeRemoved) return roomsWithPreviousSchedulerVersion[:bufferRoomsToBeRemoved], nil } + +func ComputeRollingSurge(scheduler *entities.Scheduler, desiredNumberOfRooms, totalRoomsAmount int) (maxSurgeNum int, err error) { + if scheduler.MaxSurge != "" { + isRelative := strings.HasSuffix(scheduler.MaxSurge, schedulerMaxSurgeRelativeSymbol) + maxSurgeNum, err = strconv.Atoi(strings.TrimSuffix(scheduler.MaxSurge, schedulerMaxSurgeRelativeSymbol)) + if err != nil { + return 0, fmt.Errorf("failed to parse max surge into a number: %w", err) + } + if isRelative { + maxSurgeNum = int(math.Round((float64(maxSurgeNum) / 100) * float64(desiredNumberOfRooms))) + } + } + + // Capping the amount of rooms to surge + deviation := int(math.Max(float64(totalRoomsAmount-desiredNumberOfRooms), 0)) + maxSurgeNum = int(math.Max(float64(maxSurgeNum-deviation), 0)) + + return maxSurgeNum, nil +} diff --git a/internal/core/operations/healthcontroller/executor_test.go b/internal/core/operations/healthcontroller/executor_test.go index 5f661bfab..046ec88fd 100644 --- a/internal/core/operations/healthcontroller/executor_test.go +++ b/internal/core/operations/healthcontroller/executor_test.go @@ -1370,9 +1370,8 @@ func TestSchedulerHealthController_Execute(t *testing.T) { op := operation.New(newScheduler.Name, definition.Name(), nil) // Perform rolling update - roomManager.EXPECT().SchedulerMaxSurge(gomock.Any(), newScheduler).Return(1, nil) - operationManager.EXPECT().CreatePriorityOperation(gomock.Any(), newScheduler.Name, &add.Definition{Amount: 1}).Return(op, nil) - operationManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), gomock.Any(), gomock.Any()).Times(2) + operationManager.EXPECT().CreatePriorityOperation(gomock.Any(), newScheduler.Name, &add.Definition{Amount: 0}).Return(op, nil) + operationManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), gomock.Any(), gomock.Any()).Times(3) roomStorage.EXPECT().GetRoomIDsByStatus(gomock.Any(), newScheduler.Name, game_room.GameStatusOccupied).Return(gameRoomIDs, nil) roomStorage.EXPECT().GetRoomIDsByStatus(gomock.Any(), newScheduler.Name, game_room.GameStatusReady).Return(gameRoomIDs, nil) operationManager.EXPECT().CreateOperation(gomock.Any(), newScheduler.Name, &remove.Definition{RoomsIDs: gameRoomIDs, Reason: remove.RollingUpdateReplace}).Return(op, nil) @@ -1467,3 +1466,85 @@ func newValidScheduler(autoscaling *autoscaling.Autoscaling) *entities.Scheduler Autoscaling: autoscaling, } } + +func TestComputeRollingSurgeVariants(t *testing.T) { + testCases := []struct { + name string + maxSurge string + desiredNumberOfRooms int + totalRoomsAmount int + expectedSurgeAmount int + expectError bool + }{ + { + name: "Standard surge calculation", + maxSurge: "10", + desiredNumberOfRooms: 20, + totalRoomsAmount: 15, + expectedSurgeAmount: 10, + expectError: false, + }, + { + name: "High amount of rooms above 1000", + maxSurge: "30%", + desiredNumberOfRooms: 1500, + totalRoomsAmount: 1000, + expectedSurgeAmount: 450, // It can surge up to 950 but we cap to the maxSurge of 450 + expectError: false, + }, + { + name: "Relative maxSurge with rooms above 1000", + maxSurge: "25%", + desiredNumberOfRooms: 1200, + totalRoomsAmount: 1000, + expectedSurgeAmount: 300, // 25% of 1200 is 300 + expectError: false, + }, + { + name: "Do not replace rooms during aggressive downscaling", + maxSurge: "25%", + desiredNumberOfRooms: 1200, + totalRoomsAmount: 1500, + expectedSurgeAmount: 0, + expectError: false, + }, + { + name: "Respect the surge when downscaling", + maxSurge: "25%", + desiredNumberOfRooms: 1200, + totalRoomsAmount: 1300, + expectedSurgeAmount: 200, + expectError: false, + }, + { + name: "Invalid maxSurge string format", + maxSurge: "twenty%", // Invalid because it's not a number + desiredNumberOfRooms: 300, + totalRoomsAmount: 250, + expectedSurgeAmount: 1, // Expected to be 1 because the function should return an error + expectError: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + scheduler := &entities.Scheduler{ + MaxSurge: tc.maxSurge, + } + + surgeAmount, err := healthcontroller.ComputeRollingSurge(scheduler, tc.desiredNumberOfRooms, tc.totalRoomsAmount) + if tc.expectError { + if err == nil { + t.Errorf("expected an error but got none") + } + } else { + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if surgeAmount != tc.expectedSurgeAmount { + t.Errorf("expected surge amount to be %d, but got %d", tc.expectedSurgeAmount, surgeAmount) + } + } + }) + } +}