From 46c4dcc4930ceeb24589e714deaaf3a15de3fc87 Mon Sep 17 00:00:00 2001 From: Guilherme Oliveira do Carmo Carvalho <33766735+guilhermocc@users.noreply.github.com> Date: Tue, 9 Aug 2022 11:59:51 -0300 Subject: [PATCH] Add retry mechanism to room validation logic (#512) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Delete specific execution errors, to maintain error history responsibility to each operation executor * Update new scheduler version executor to populate its execution history according to the validation error * Update operation executor methods signatures to use error interface instead of a custom one * Remove commented lines * Remove useless if * Fix messages typos * Remove useless error checking * Remove useless error type * Add retry mechanism to room validation logic * Resolve rebase conflicts * Revert swagger changes * Update internal/core/operations/newschedulerversion/new_scheduler_version_executor_test.go Co-authored-by: Arthur Gonçalves Co-authored-by: Arthur Gonçalves --- config/config.yaml | 1 + .../delete_scheduler_executor_test.go | 4 +- .../newschedulerversion/messages.go | 16 +- .../new_scheduler_version_executor.go | 28 ++- .../new_scheduler_version_executor_test.go | 178 +++++++++++++++--- internal/service/config.go | 6 + 6 files changed, 189 insertions(+), 44 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 6b4ce1ae5..1f68bd75d 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -60,6 +60,7 @@ services: roomPingTimeoutMillis: 240000 roomInitializationTimeoutMillis: 120000 roomDeletionTimeoutMillis: 120000 + roomValidationAttempts: 3 operationManager: operationLeaseTTLMillis: 5000 eventsForwarder: diff --git a/internal/core/operations/deletescheduler/delete_scheduler_executor_test.go b/internal/core/operations/deletescheduler/delete_scheduler_executor_test.go index ed31e8954..026cbe90b 100644 --- a/internal/core/operations/deletescheduler/delete_scheduler_executor_test.go +++ b/internal/core/operations/deletescheduler/delete_scheduler_executor_test.go @@ -283,7 +283,7 @@ func TestDeleteSchedulerExecutor_Execute(t *testing.T) { Return(errors.New("some error on storage")) err := executor.Execute(ctx, op, definition) - require.Equal(t, errors.New("some error on storage"), err) + require.EqualError(t, err, "some error on storage") }) t.Run("when it fails to delete scheduler in runtime", func(t *testing.T) { @@ -303,7 +303,7 @@ func TestDeleteSchedulerExecutor_Execute(t *testing.T) { runtime.EXPECT().DeleteScheduler(ctx, scheduler).Return(errors.New("some error on runtime")) err := executor.Execute(ctx, op, definition) - require.Equal(t, errors.New("some error on runtime"), err) + require.EqualError(t, err, "some error on runtime") }) }) } diff --git a/internal/core/operations/newschedulerversion/messages.go b/internal/core/operations/newschedulerversion/messages.go index c9ec6cfe0..a6a5139fd 100644 --- a/internal/core/operations/newschedulerversion/messages.go +++ b/internal/core/operations/newschedulerversion/messages.go @@ -23,11 +23,19 @@ package newschedulerversion const ( - validationTimeoutMessageTemplate = `The GRU could not be validated. Maestro got timeout waiting for the GRU with ID: %s to be ready. You can check if - the GRU image is stable on its logs. If you could not spot any issues, contact the Maestro's responsible team for helping.` + startingValidationMessageTemplate = "Major version detected, starting game room validation process..." - validationPodInErrorMessageTemplate = `The GRU could not be validated. The room created for validation with ID %s is entering in error state. You can check if + enqueuedSwitchVersionMessageTemplate = "Enqueued switch active version operation with id: %s" + + validationSuccessMessageTemplate = "%dº Attempt: Game room validation success!" + + allAttemptsFailedMessageTemplate = "All validation attempts have failed, operation aborted!" + + validationTimeoutMessageTemplate = `%dº Attempt: Got timeout waiting for the GRU with ID: %s to be ready. You can check if + the GRU image is stable on its logs.` + + validationPodInErrorMessageTemplate = `%dº Attempt: The room created for validation with ID %s is entering in error state. You can check if the GRU image is stable on its logs using the provided room id. Last event in the game room: %s.` - validationUnexpectedErrorMessageTemplate = `The GRU could not be validated. Unexpected Error: %s - Contact the Maestro's responsible team for helping.` + validationUnexpectedErrorMessageTemplate = `%dº Attempt: Unexpected Error: %s - Contact the Maestro's responsible team for helping.` ) diff --git a/internal/core/operations/newschedulerversion/new_scheduler_version_executor.go b/internal/core/operations/newschedulerversion/new_scheduler_version_executor.go index 5897a91b3..d82e43567 100644 --- a/internal/core/operations/newschedulerversion/new_scheduler_version_executor.go +++ b/internal/core/operations/newschedulerversion/new_scheduler_version_executor.go @@ -28,6 +28,8 @@ import ( "fmt" "time" + "github.com/avast/retry-go/v4" + serviceerrors "github.com/topfreegames/maestro/internal/core/services/errors" "github.com/topfreegames/maestro/internal/core/entities/game_room" @@ -49,6 +51,7 @@ import ( // Config defines configurations for the CreateNewSchedulerVersionExecutor. type Config struct { RoomInitializationTimeout time.Duration + RoomValidationAttempts int } // CreateNewSchedulerVersionExecutor holds the dependecies to execute the operation to create a new scheduler version. @@ -101,9 +104,17 @@ func (ex *CreateNewSchedulerVersionExecutor) Execute(ctx context.Context, op *op } if isSchedulerMajorVersion { - validationError := ex.validateGameRoomCreation(ctx, newScheduler, logger) - if ex.treatValidationError(ctx, op, validationError) != nil { - return validationError + ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, startingValidationMessageTemplate) + currentAttempt := 0 + retryError := retry.Do(func() error { + currentAttempt++ + validationError := ex.validateGameRoomCreation(ctx, newScheduler, logger) + return ex.treatValidationError(ctx, op, validationError, currentAttempt) + }, retry.Attempts(uint(ex.config.RoomValidationAttempts))) + if retryError != nil { + logger.Error("game room validation failed after all attempts", zap.Error(retryError)) + ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, allAttemptsFailedMessageTemplate) + return retryError } } @@ -112,7 +123,7 @@ func (ex *CreateNewSchedulerVersionExecutor) Execute(ctx context.Context, op *op return err } - ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf("enqueued switch active version operation with id: %s", switchOpID)) + ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(enqueuedSwitchVersionMessageTemplate, switchOpID)) logger.Sugar().Infof("new scheduler version created: %s, is major: %t", newScheduler.Spec.Version, isSchedulerMajorVersion) logger.Sugar().Infof("%s operation succeded, %s operation enqueued to continue scheduler update process, switching to version %s", opDef.Name(), switch_active_version.OperationName, newScheduler.Spec.Version) return nil @@ -206,20 +217,21 @@ func (ex *CreateNewSchedulerVersionExecutor) RemoveValidationRoomID(schedulerNam delete(ex.validationRoomIdsMap, schedulerName) } -func (ex *CreateNewSchedulerVersionExecutor) treatValidationError(ctx context.Context, op *operation.Operation, validationError error) error { +func (ex *CreateNewSchedulerVersionExecutor) treatValidationError(ctx context.Context, op *operation.Operation, validationError error, currentAttempt int) error { switch { case errors.Is(validationError, &ValidationPodInErrorError{}): err := validationError.(*ValidationPodInErrorError) - ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationPodInErrorMessageTemplate, err.GameRoomID, err.StatusDescription)) + ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationPodInErrorMessageTemplate, currentAttempt, err.GameRoomID, err.StatusDescription)) return validationError case errors.Is(validationError, &ValidationTimeoutError{}): - ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationTimeoutMessageTemplate, validationError.(*ValidationTimeoutError).GameRoom.ID)) + ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationTimeoutMessageTemplate, currentAttempt, validationError.(*ValidationTimeoutError).GameRoom.ID)) return validationError case validationError != nil: - ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationUnexpectedErrorMessageTemplate, validationError.Error())) + ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationUnexpectedErrorMessageTemplate, currentAttempt, validationError.Error())) return validationError } + ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationSuccessMessageTemplate, currentAttempt)) return nil } diff --git a/internal/core/operations/newschedulerversion/new_scheduler_version_executor_test.go b/internal/core/operations/newschedulerversion/new_scheduler_version_executor_test.go index f81090307..38d31eecb 100644 --- a/internal/core/operations/newschedulerversion/new_scheduler_version_executor_test.go +++ b/internal/core/operations/newschedulerversion/new_scheduler_version_executor_test.go @@ -72,6 +72,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { switchOpID := "switch-active-version-op-id" config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -93,7 +94,62 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { }) schedulerManager.EXPECT().GetActiveScheduler(gomock.Any(), newScheduler.Name).Return(currentActiveScheduler, nil) schedulerManager.EXPECT().GetSchedulerVersions(gomock.Any(), newScheduler.Name).Return(schedulerVersions, nil) - operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("enqueued switch active version operation with id: %s", switchOpID)) + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "Major version detected, starting game room validation process...") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "1º Attempt: Game room validation success!") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("Enqueued switch active version operation with id: %s", switchOpID)) + + result := executor.Execute(context.Background(), op, operationDef) + + require.Nil(t, result) + }) + + t.Run("should succeed - major version update, game room is valid, validation succeeds in the configured max attempt, returns no error -> enqueue switch active version op", func(t *testing.T) { + mockCtrl := gomock.NewController(t) + + currentActiveScheduler := newValidSchedulerWithImageVersion("image-v1") + newScheduler := *newValidSchedulerWithImageVersion("image-v2") + newSchedulerExpectedVersion := "v2.0.0" + op := &operation.Operation{ + ID: "123", + Status: operation.StatusInProgress, + DefinitionName: newschedulerversion.OperationName, + SchedulerName: newScheduler.Name, + } + operationDef := &newschedulerversion.CreateNewSchedulerVersionDefinition{NewScheduler: &newScheduler} + roomManager := mockports.NewMockRoomManager(mockCtrl) + schedulerManager := mockports.NewMockSchedulerManager(mockCtrl) + operationsManager := mockports.NewMockOperationManager(mockCtrl) + switchOpID := "switch-active-version-op-id" + config := newschedulerversion.Config{ + RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 3, + } + + executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) + + schedulerVersions := []*entities.SchedulerVersion{{Version: "v1.0.0"}, {Version: "v1.1.0"}, {Version: "v1.2.0"}} + gameRoom := &game_room.GameRoom{ID: "id-1"} + + roomManager.EXPECT().CreateRoom(gomock.Any(), gomock.Any(), true).Return(nil, nil, errors.NewErrUnexpected("some error")).Times(2) + roomManager.EXPECT().CreateRoom(gomock.Any(), gomock.Any(), true).Return(gameRoom, nil, nil).Times(1) + roomManager.EXPECT().WaitRoomStatus(gomock.Any(), gameRoom, []game_room.GameRoomStatus{game_room.GameStatusReady, game_room.GameStatusError}).Return(game_room.GameStatusReady, nil) + roomManager.EXPECT().DeleteRoom(gomock.Any(), gomock.Any()).Return(nil) + + schedulerManager. + EXPECT(). + CreateNewSchedulerVersionAndEnqueueSwitchVersion(gomock.Any(), gomock.Any()). + DoAndReturn( + func(ctx context.Context, scheduler *entities.Scheduler) (string, error) { + require.Equal(t, newSchedulerExpectedVersion, scheduler.Spec.Version) + return switchOpID, nil + }) + schedulerManager.EXPECT().GetActiveScheduler(gomock.Any(), newScheduler.Name).Return(currentActiveScheduler, nil) + schedulerManager.EXPECT().GetSchedulerVersions(gomock.Any(), newScheduler.Name).Return(schedulerVersions, nil) + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "Major version detected, starting game room validation process...") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "1º Attempt: Unexpected Error: some error - Contact the Maestro's responsible team for helping.") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "2º Attempt: Unexpected Error: some error - Contact the Maestro's responsible team for helping.") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "3º Attempt: Game room validation success!") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("Enqueued switch active version operation with id: %s", switchOpID)) result := executor.Execute(context.Background(), op, operationDef) @@ -119,6 +175,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { switchOpID := "switch-active-version-op-id" config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -140,7 +197,9 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { }) schedulerManager.EXPECT().GetActiveScheduler(gomock.Any(), newScheduler.Name).Return(currentActiveScheduler, nil) schedulerManager.EXPECT().GetSchedulerVersions(gomock.Any(), newScheduler.Name).Return(schedulerVersions, nil) - operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("enqueued switch active version operation with id: %s", switchOpID)) + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "Major version detected, starting game room validation process...") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "1º Attempt: Game room validation success!") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("Enqueued switch active version operation with id: %s", switchOpID)) result := executor.Execute(context.Background(), op, operationDef) @@ -166,6 +225,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { switchOpID := "switch-active-version-op-id" config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -187,7 +247,9 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { }) schedulerManager.EXPECT().GetActiveScheduler(gomock.Any(), newScheduler.Name).Return(currentActiveScheduler, nil) schedulerManager.EXPECT().GetSchedulerVersions(gomock.Any(), newScheduler.Name).Return(schedulerVersions, nil) - operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("enqueued switch active version operation with id: %s", switchOpID)) + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "Major version detected, starting game room validation process...") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "1º Attempt: Game room validation success!") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("Enqueued switch active version operation with id: %s", switchOpID)) result := executor.Execute(context.Background(), op, operationDef) @@ -213,6 +275,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { switchOpID := "switch-active-version-op-id" config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -234,7 +297,9 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { }) schedulerManager.EXPECT().GetActiveScheduler(gomock.Any(), newScheduler.Name).Return(currentActiveScheduler, nil) schedulerManager.EXPECT().GetSchedulerVersions(gomock.Any(), newScheduler.Name).Return(schedulerVersions, nil) - operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("enqueued switch active version operation with id: %s", switchOpID)) + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "Major version detected, starting game room validation process...") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "1º Attempt: Game room validation success!") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("Enqueued switch active version operation with id: %s", switchOpID)) result := executor.Execute(context.Background(), op, operationDef) @@ -258,6 +323,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { operationsManager := mockports.NewMockOperationManager(mockCtrl) config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -289,6 +355,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { schedulerVersions := []*entities.SchedulerVersion{{Version: "v-----"}} config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -319,6 +386,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { operationsManager := mockports.NewMockOperationManager(mockCtrl) config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -331,11 +399,9 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { schedulerManager.EXPECT().GetActiveScheduler(gomock.Any(), newScheduler.Name).Return(currentActiveScheduler, nil) schedulerManager.EXPECT().GetSchedulerVersions(gomock.Any(), newScheduler.Name).Return(schedulerVersions, nil) - operationsManager.EXPECT().AppendOperationEventToExecutionHistory( - gomock.Any(), - op, - `The GRU could not be validated. Unexpected Error: error creating test game room - Contact the Maestro's responsible team for helping.`, - ) + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "Major version detected, starting game room validation process...") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "1º Attempt: Unexpected Error: error creating test game room - Contact the Maestro's responsible team for helping.") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "All validation attempts have failed, operation aborted!") roomManager.EXPECT().CreateRoom(gomock.Any(), gomock.Any(), true).Return(nil, nil, fmt.Errorf("error creating test game room")) @@ -344,6 +410,52 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { require.ErrorContains(t, operationExecutionError, "error creating test game room") }) + t.Run("should fail - major version update, game room is invalid, validation fails in all attempts", func(t *testing.T) { + mockCtrl := gomock.NewController(t) + + currentActiveScheduler := newValidSchedulerWithImageVersion("image-v1") + newScheduler := *newValidSchedulerWithImageVersion("image-v2") + op := &operation.Operation{ + ID: "123", + Status: operation.StatusInProgress, + DefinitionName: newschedulerversion.OperationName, + SchedulerName: newScheduler.Name, + } + operationDef := &newschedulerversion.CreateNewSchedulerVersionDefinition{NewScheduler: &newScheduler} + roomManager := mockports.NewMockRoomManager(mockCtrl) + schedulerManager := mockports.NewMockSchedulerManager(mockCtrl) + operationsManager := mockports.NewMockOperationManager(mockCtrl) + config := newschedulerversion.Config{ + RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 3, + } + + executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) + schedulerVersions := []*entities.SchedulerVersion{{Version: "v2.0.0"}, {Version: "v3.1.0"}, {Version: "v1.2.0"}} + + newSchedulerWithNewVersion := newScheduler + newSchedulerWithNewVersion.Spec.Version = "v2.0.0" + newSchedulerWithNewVersion.RollbackVersion = "v1.0.0" + gameRoom := &game_room.GameRoom{ID: "id-1", SchedulerID: "some-scheduler"} + + roomManager.EXPECT().CreateRoom(gomock.Any(), gomock.Any(), true).Return(gameRoom, nil, nil).Times(3) + roomManager.EXPECT().WaitRoomStatus(gomock.Any(), gameRoom, []game_room.GameRoomStatus{game_room.GameStatusReady, game_room.GameStatusError}).Return(game_room.GameStatusReady, serviceerrors.NewErrGameRoomStatusWaitingTimeout("some error")).Times(3) + roomManager.EXPECT().DeleteRoom(gomock.Any(), gameRoom).Return(nil).Times(3) + + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "Major version detected, starting game room validation process...") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "1º Attempt: Got timeout waiting for the GRU with ID: id-1 to be ready. You can check if\n\t\tthe GRU image is stable on its logs.") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "2º Attempt: Got timeout waiting for the GRU with ID: id-1 to be ready. You can check if\n\t\tthe GRU image is stable on its logs.") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "3º Attempt: Got timeout waiting for the GRU with ID: id-1 to be ready. You can check if\n\t\tthe GRU image is stable on its logs.") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "All validation attempts have failed, operation aborted!") + + schedulerManager.EXPECT().GetActiveScheduler(gomock.Any(), newScheduler.Name).Return(currentActiveScheduler, nil) + schedulerManager.EXPECT().GetSchedulerVersions(gomock.Any(), newScheduler.Name).Return(schedulerVersions, nil) + + operationExecutionError := executor.Execute(context.Background(), op, operationDef) + + require.ErrorContains(t, operationExecutionError, "error validating game room with ID") + }) + t.Run("should fail - major version update, game room is invalid, timeout error -> returns error, don't create new version/switch to it", func(t *testing.T) { mockCtrl := gomock.NewController(t) @@ -361,6 +473,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { operationsManager := mockports.NewMockOperationManager(mockCtrl) config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -374,12 +487,9 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { roomManager.EXPECT().CreateRoom(gomock.Any(), gomock.Any(), true).Return(gameRoom, nil, nil) roomManager.EXPECT().WaitRoomStatus(gomock.Any(), gameRoom, []game_room.GameRoomStatus{game_room.GameStatusReady, game_room.GameStatusError}).Return(game_room.GameStatusReady, serviceerrors.NewErrGameRoomStatusWaitingTimeout("some error")) roomManager.EXPECT().DeleteRoom(gomock.Any(), gameRoom).Return(nil) - operationsManager.EXPECT().AppendOperationEventToExecutionHistory( - gomock.Any(), - op, - `The GRU could not be validated. Maestro got timeout waiting for the GRU with ID: id-1 to be ready. You can check if - the GRU image is stable on its logs. If you could not spot any issues, contact the Maestro's responsible team for helping.`, - ) + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "Major version detected, starting game room validation process...") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "1º Attempt: Got timeout waiting for the GRU with ID: id-1 to be ready. You can check if\n\t\tthe GRU image is stable on its logs.") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "All validation attempts have failed, operation aborted!") schedulerManager.EXPECT().GetActiveScheduler(gomock.Any(), newScheduler.Name).Return(currentActiveScheduler, nil) schedulerManager.EXPECT().GetSchedulerVersions(gomock.Any(), newScheduler.Name).Return(schedulerVersions, nil) @@ -406,6 +516,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { operationsManager := mockports.NewMockOperationManager(mockCtrl) config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -426,12 +537,9 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { roomManager.EXPECT().WaitRoomStatus(gomock.Any(), gameRoom, []game_room.GameRoomStatus{game_room.GameStatusReady, game_room.GameStatusError}).Return(game_room.GameStatusError, nil) roomManager.EXPECT().DeleteRoom(gomock.Any(), gameRoom).Return(nil) roomManager.EXPECT().GetRoomInstance(gomock.Any(), gameRoom.SchedulerID, gameRoom.ID).Return(roomInstance, nil) - operationsManager.EXPECT().AppendOperationEventToExecutionHistory( - gomock.Any(), - op, - `The GRU could not be validated. The room created for validation with ID id-1 is entering in error state. You can check if - the GRU image is stable on its logs using the provided room id. Last event in the game room: pod in Crashloop.`, - ) + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "Major version detected, starting game room validation process...") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "1º Attempt: The room created for validation with ID id-1 is entering in error state. You can check if\n\t\tthe GRU image is stable on its logs using the provided room id. Last event in the game room: pod in Crashloop.") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "All validation attempts have failed, operation aborted!") schedulerManager.EXPECT().GetActiveScheduler(gomock.Any(), newScheduler.Name).Return(currentActiveScheduler, nil) schedulerManager.EXPECT().GetSchedulerVersions(gomock.Any(), newScheduler.Name).Return(schedulerVersions, nil) @@ -458,6 +566,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { operationsManager := mockports.NewMockOperationManager(mockCtrl) config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -472,12 +581,9 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { roomManager.EXPECT().WaitRoomStatus(gomock.Any(), gameRoom, []game_room.GameRoomStatus{game_room.GameStatusReady, game_room.GameStatusError}).Return(game_room.GameStatusError, nil) roomManager.EXPECT().DeleteRoom(gomock.Any(), gameRoom).Return(nil) roomManager.EXPECT().GetRoomInstance(gomock.Any(), gameRoom.SchedulerID, gameRoom.ID).Return(nil, errors.NewErrUnexpected("some error")) - operationsManager.EXPECT().AppendOperationEventToExecutionHistory( - gomock.Any(), - op, - `The GRU could not be validated. The room created for validation with ID id-1 is entering in error state. You can check if - the GRU image is stable on its logs using the provided room id. Last event in the game room: unknown.`, - ) + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "Major version detected, starting game room validation process...") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "1º Attempt: The room created for validation with ID id-1 is entering in error state. You can check if\n\t\tthe GRU image is stable on its logs using the provided room id. Last event in the game room: unknown.") + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, "All validation attempts have failed, operation aborted!") schedulerManager.EXPECT().GetActiveScheduler(gomock.Any(), newScheduler.Name).Return(currentActiveScheduler, nil) schedulerManager.EXPECT().GetSchedulerVersions(gomock.Any(), newScheduler.Name).Return(schedulerVersions, nil) @@ -506,6 +612,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { switchOpID := "switch-active-version-op-id" config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -522,7 +629,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { }) schedulerManager.EXPECT().GetActiveScheduler(gomock.Any(), newScheduler.Name).Return(currentActiveScheduler, nil) schedulerManager.EXPECT().GetSchedulerVersions(gomock.Any(), newScheduler.Name).Return(schedulerVersions, nil) - operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("enqueued switch active version operation with id: %s", switchOpID)) + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("Enqueued switch active version operation with id: %s", switchOpID)) result := executor.Execute(context.Background(), op, operationDef) @@ -548,6 +655,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { switchOpID := "switch-active-version-op-id" config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -564,7 +672,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { }) schedulerManager.EXPECT().GetActiveScheduler(gomock.Any(), newScheduler.Name).Return(currentActiveScheduler, nil) schedulerManager.EXPECT().GetSchedulerVersions(gomock.Any(), newScheduler.Name).Return(schedulerVersions, nil) - operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("enqueued switch active version operation with id: %s", switchOpID)) + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("Enqueued switch active version operation with id: %s", switchOpID)) result := executor.Execute(context.Background(), op, operationDef) @@ -590,6 +698,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { switchOpID := "switch-active-version-op-id" config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -606,7 +715,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { }) schedulerManager.EXPECT().GetActiveScheduler(gomock.Any(), newScheduler.Name).Return(currentActiveScheduler, nil) schedulerManager.EXPECT().GetSchedulerVersions(gomock.Any(), newScheduler.Name).Return(schedulerVersions, nil) - operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("enqueued switch active version operation with id: %s", switchOpID)) + operationsManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), op, fmt.Sprintf("Enqueued switch active version operation with id: %s", switchOpID)) result := executor.Execute(context.Background(), op, operationDef) @@ -630,6 +739,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { operationsManager := mockports.NewMockOperationManager(mockCtrl) config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -660,6 +770,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { schedulerVersions := []*entities.SchedulerVersion{{Version: "v-----"}} config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -689,6 +800,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { operationsManager := mockports.NewMockOperationManager(mockCtrl) config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -724,6 +836,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { operationsManager := mockports.NewMockOperationManager(mockCtrl) config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -754,6 +867,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { operationsManager := mockports.NewMockOperationManager(mockCtrl) config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -784,6 +898,7 @@ func TestCreateNewSchedulerVersionExecutor_Execute(t *testing.T) { operationsManager := mockports.NewMockOperationManager(mockCtrl) config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -820,6 +935,7 @@ func TestCreateNewSchedulerVersionExecutor_Rollback(t *testing.T) { operationsManager := mockports.NewMockOperationManager(mockCtrl) config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -846,6 +962,7 @@ func TestCreateNewSchedulerVersionExecutor_Rollback(t *testing.T) { operationsManager := mockports.NewMockOperationManager(mockCtrl) config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) @@ -872,6 +989,7 @@ func TestCreateNewSchedulerVersionExecutor_Rollback(t *testing.T) { operationsManager := mockports.NewMockOperationManager(mockCtrl) config := newschedulerversion.Config{ RoomInitializationTimeout: time.Duration(120000), + RoomValidationAttempts: 1, } executor := newschedulerversion.NewExecutor(roomManager, schedulerManager, operationsManager, config) diff --git a/internal/service/config.go b/internal/service/config.go index 40a06225a..fc4ac6dd0 100644 --- a/internal/service/config.go +++ b/internal/service/config.go @@ -38,6 +38,7 @@ import ( const ( healthControllerExecutionIntervalConfigPath = "workers.healthControllerInterval" roomInitializationTimeoutMillisConfigPath = "services.roomManager.roomInitializationTimeoutMillis" + roomRoomValidationAttemptsConfigPath = "services.roomManager.roomValidationAttempts" roomPingTimeoutMillisConfigPath = "services.roomManager.roomPingTimeoutMillis" roomDeletionTimeoutMillisConfigPath = "services.roomManager.roomDeletionTimeoutMillis" operationLeaseTTLMillisConfigPath = "services.operationManager.operationLeaseTTLMillis" @@ -47,9 +48,14 @@ const ( // NewCreateSchedulerVersionConfig instantiate a new CreateSchedulerVersionConfig to be used by the NewSchedulerVersion operation to customize its configuration. func NewCreateSchedulerVersionConfig(c config.Config) newschedulerversion.Config { initializationTimeout := time.Duration(c.GetInt(roomInitializationTimeoutMillisConfigPath)) * time.Millisecond + roomValidationAttempts := c.GetInt(roomRoomValidationAttemptsConfigPath) + if roomValidationAttempts < 1 { + roomValidationAttempts = 1 + } createSchedulerVersionConfig := newschedulerversion.Config{ RoomInitializationTimeout: initializationTimeout, + RoomValidationAttempts: roomValidationAttempts, } return createSchedulerVersionConfig