Skip to content

Commit

Permalink
Add retry mechanism to room validation logic (#512)
Browse files Browse the repository at this point in the history
* Delete specific execution errors, leaving responsibility for the error history to each operation executor

* Update new scheduler version executor to populate its execution history according to the validation error

* Update operation executor method signatures to use the error interface instead of a custom one

* Remove commented lines

* Remove useless if

* Fix message typos

* Remove useless error checking

* Remove useless error type

* Add retry mechanism to room validation logic

* Resolve rebase conflicts

* Revert swagger changes

* Update internal/core/operations/newschedulerversion/new_scheduler_version_executor_test.go

Co-authored-by: Arthur Gonçalves <[email protected]>

Co-authored-by: Arthur Gonçalves <[email protected]>
  • Loading branch information
guilhermocc and arthur29 authored Aug 9, 2022
1 parent 11e027b commit 46c4dcc
Show file tree
Hide file tree
Showing 6 changed files with 189 additions and 44 deletions.
1 change: 1 addition & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ services:
roomPingTimeoutMillis: 240000
roomInitializationTimeoutMillis: 120000
roomDeletionTimeoutMillis: 120000
roomValidationAttempts: 3
operationManager:
operationLeaseTTLMillis: 5000
eventsForwarder:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ func TestDeleteSchedulerExecutor_Execute(t *testing.T) {
Return(errors.New("some error on storage"))

err := executor.Execute(ctx, op, definition)
require.Equal(t, errors.New("some error on storage"), err)
require.EqualError(t, err, "some error on storage")
})

t.Run("when it fails to delete scheduler in runtime", func(t *testing.T) {
Expand All @@ -303,7 +303,7 @@ func TestDeleteSchedulerExecutor_Execute(t *testing.T) {
runtime.EXPECT().DeleteScheduler(ctx, scheduler).Return(errors.New("some error on runtime"))

err := executor.Execute(ctx, op, definition)
require.Equal(t, errors.New("some error on runtime"), err)
require.EqualError(t, err, "some error on runtime")
})
})
}
Expand Down
16 changes: 12 additions & 4 deletions internal/core/operations/newschedulerversion/messages.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,19 @@
package newschedulerversion

const (
validationTimeoutMessageTemplate = `The GRU could not be validated. Maestro got timeout waiting for the GRU with ID: %s to be ready. You can check if
the GRU image is stable on its logs. If you could not spot any issues, contact the Maestro's responsible team for helping.`
startingValidationMessageTemplate = "Major version detected, starting game room validation process..."

validationPodInErrorMessageTemplate = `The GRU could not be validated. The room created for validation with ID %s is entering in error state. You can check if
enqueuedSwitchVersionMessageTemplate = "Enqueued switch active version operation with id: %s"

validationSuccessMessageTemplate = "%dº Attempt: Game room validation success!"

allAttemptsFailedMessageTemplate = "All validation attempts have failed, operation aborted!"

validationTimeoutMessageTemplate = `%dº Attempt: Got timeout waiting for the GRU with ID: %s to be ready. You can check if
the GRU image is stable on its logs.`

validationPodInErrorMessageTemplate = `%dº Attempt: The room created for validation with ID %s is entering in error state. You can check if
the GRU image is stable on its logs using the provided room id. Last event in the game room: %s.`

validationUnexpectedErrorMessageTemplate = `The GRU could not be validated. Unexpected Error: %s - Contact the Maestro's responsible team for helping.`
validationUnexpectedErrorMessageTemplate = `%dº Attempt: Unexpected Error: %s - Contact the Maestro's responsible team for helping.`
)
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ import (
"fmt"
"time"

"github.com/avast/retry-go/v4"

serviceerrors "github.com/topfreegames/maestro/internal/core/services/errors"

"github.com/topfreegames/maestro/internal/core/entities/game_room"
Expand All @@ -49,6 +51,7 @@ import (
// Config defines configurations for the CreateNewSchedulerVersionExecutor.
type Config struct {
RoomInitializationTimeout time.Duration
RoomValidationAttempts int
}

// CreateNewSchedulerVersionExecutor holds the dependencies to execute the operation to create a new scheduler version.
Expand Down Expand Up @@ -101,9 +104,17 @@ func (ex *CreateNewSchedulerVersionExecutor) Execute(ctx context.Context, op *op
}

if isSchedulerMajorVersion {
validationError := ex.validateGameRoomCreation(ctx, newScheduler, logger)
if ex.treatValidationError(ctx, op, validationError) != nil {
return validationError
ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, startingValidationMessageTemplate)
currentAttempt := 0
retryError := retry.Do(func() error {
currentAttempt++
validationError := ex.validateGameRoomCreation(ctx, newScheduler, logger)
return ex.treatValidationError(ctx, op, validationError, currentAttempt)
}, retry.Attempts(uint(ex.config.RoomValidationAttempts)))
if retryError != nil {
logger.Error("game room validation failed after all attempts", zap.Error(retryError))
ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, allAttemptsFailedMessageTemplate)
return retryError
}
}

Expand All @@ -112,7 +123,7 @@ func (ex *CreateNewSchedulerVersionExecutor) Execute(ctx context.Context, op *op
return err
}

ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf("enqueued switch active version operation with id: %s", switchOpID))
ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(enqueuedSwitchVersionMessageTemplate, switchOpID))
logger.Sugar().Infof("new scheduler version created: %s, is major: %t", newScheduler.Spec.Version, isSchedulerMajorVersion)
logger.Sugar().Infof("%s operation succeded, %s operation enqueued to continue scheduler update process, switching to version %s", opDef.Name(), switch_active_version.OperationName, newScheduler.Spec.Version)
return nil
Expand Down Expand Up @@ -206,20 +217,21 @@ func (ex *CreateNewSchedulerVersionExecutor) RemoveValidationRoomID(schedulerNam
delete(ex.validationRoomIdsMap, schedulerName)
}

func (ex *CreateNewSchedulerVersionExecutor) treatValidationError(ctx context.Context, op *operation.Operation, validationError error) error {
func (ex *CreateNewSchedulerVersionExecutor) treatValidationError(ctx context.Context, op *operation.Operation, validationError error, currentAttempt int) error {
switch {
case errors.Is(validationError, &ValidationPodInErrorError{}):
err := validationError.(*ValidationPodInErrorError)
ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationPodInErrorMessageTemplate, err.GameRoomID, err.StatusDescription))
ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationPodInErrorMessageTemplate, currentAttempt, err.GameRoomID, err.StatusDescription))
return validationError
case errors.Is(validationError, &ValidationTimeoutError{}):
ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationTimeoutMessageTemplate, validationError.(*ValidationTimeoutError).GameRoom.ID))
ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationTimeoutMessageTemplate, currentAttempt, validationError.(*ValidationTimeoutError).GameRoom.ID))
return validationError
case validationError != nil:
ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationUnexpectedErrorMessageTemplate, validationError.Error()))
ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationUnexpectedErrorMessageTemplate, currentAttempt, validationError.Error()))
return validationError
}

ex.operationManager.AppendOperationEventToExecutionHistory(ctx, op, fmt.Sprintf(validationSuccessMessageTemplate, currentAttempt))
return nil
}

Expand Down
Loading

0 comments on commit 46c4dcc

Please sign in to comment.