Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Internal] Retry on 504 when calling the permission API #4355

Merged
merged 2 commits into from
Dec 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions common/retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@ package common

import (
"context"
"log"
"errors"
"regexp"

"github.com/databricks/databricks-sdk-go/apierr"
"github.com/databricks/databricks-sdk-go/logger"
"github.com/databricks/databricks-sdk-go/retries"
)

Expand All @@ -15,11 +17,27 @@ func RetryOnTimeout[T any](ctx context.Context, f func(context.Context) (*T, err
msg := err.Error()
isTimeout := timeoutRegex.MatchString(msg)
if isTimeout {
log.Printf("[DEBUG] Retrying due to timeout: %s", msg)
logger.Debugf(ctx, "Retrying due to timeout: %s", msg)
}
return isTimeout
}))
return r.Run(ctx, func(ctx context.Context) (*T, error) {
return f(ctx)
})
}

// RetryOn504 returns a [retries.Retrier] that calls the given method
// until it either succeeds or returns an error that is different from
// [apierr.ErrDeadlineExceeded].
func RetryOn504[T any](ctx context.Context, f func(context.Context) (*T, error)) (*T, error) {
r := retries.New[T](retries.WithTimeout(-1), retries.WithRetryFunc(func(err error) bool {
if !errors.Is(err, apierr.ErrDeadlineExceeded) {
return false
}
logger.Debugf(ctx, "Retrying on error 504")
return true
}))
return r.Run(ctx, func(ctx context.Context) (*T, error) {
return f(ctx)
})
}
85 changes: 85 additions & 0 deletions common/retry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"testing"

"github.com/databricks/databricks-sdk-go/apierr"
"github.com/databricks/databricks-sdk-go/experimental/mocks"
"github.com/databricks/databricks-sdk-go/service/workspace"
"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -47,3 +48,87 @@ func TestRetryOnTimeout_NonRetriableError(t *testing.T) {
})
assert.ErrorIs(t, err, expected)
}

func TestRetryOn504_noError(t *testing.T) {
wantErr := error(nil)
wantRes := (*workspace.ObjectInfo)(nil)
wantCalls := 1

w := mocks.NewMockWorkspaceClient(t)
api := w.GetMockWorkspaceAPI().EXPECT()
api.GetStatusByPath(mock.Anything, mock.Anything).Return(wantRes, wantErr)

gotCalls := 0
gotRes, gotErr := RetryOn504(context.Background(), func(ctx context.Context) (*workspace.ObjectInfo, error) {
gotCalls += 1
return w.WorkspaceClient.Workspace.GetStatusByPath(ctx, "path")
})

assert.ErrorIs(t, gotErr, wantErr)
assert.Equal(t, gotRes, wantRes)
assert.Equal(t, gotCalls, wantCalls)
}

func TestRetryOn504_errorNot504(t *testing.T) {
wantErr := errors.New("test error")
wantRes := (*workspace.ObjectInfo)(nil)
wantCalls := 1

w := mocks.NewMockWorkspaceClient(t)
api := w.GetMockWorkspaceAPI().EXPECT()
api.GetStatusByPath(mock.Anything, mock.Anything).Return(wantRes, wantErr)

gotCalls := 0
gotRes, gotErr := RetryOn504(context.Background(), func(ctx context.Context) (*workspace.ObjectInfo, error) {
gotCalls += 1
return w.WorkspaceClient.Workspace.GetStatusByPath(ctx, "path")
})

assert.ErrorIs(t, gotErr, wantErr)
assert.Equal(t, gotRes, wantRes)
assert.Equal(t, gotCalls, wantCalls)
}

func TestRetryOn504_error504ThenFail(t *testing.T) {
wantErr := errors.New("test error")
wantRes := (*workspace.ObjectInfo)(nil)
wantCalls := 2

w := mocks.NewMockWorkspaceClient(t)
api := w.GetMockWorkspaceAPI().EXPECT()
call := api.GetStatusByPath(mock.Anything, mock.Anything).Return(nil, apierr.ErrDeadlineExceeded)
call.Repeatability = 1
api.GetStatusByPath(mock.Anything, mock.Anything).Return(wantRes, wantErr)

gotCalls := 0
gotRes, gotErr := RetryOn504(context.Background(), func(ctx context.Context) (*workspace.ObjectInfo, error) {
gotCalls++
return w.WorkspaceClient.Workspace.GetStatusByPath(ctx, "path")
})

assert.ErrorIs(t, gotErr, wantErr)
assert.Equal(t, gotRes, wantRes)
assert.Equal(t, gotCalls, wantCalls)
}

func TestRetryOn504_error504ThenSuccess(t *testing.T) {
wantErr := error(nil)
wantRes := &workspace.ObjectInfo{}
wantCalls := 2

w := mocks.NewMockWorkspaceClient(t)
api := w.GetMockWorkspaceAPI().EXPECT()
call := api.GetStatusByPath(mock.Anything, mock.Anything).Return(nil, apierr.ErrDeadlineExceeded)
call.Repeatability = 1
api.GetStatusByPath(mock.Anything, mock.Anything).Return(wantRes, wantErr)

gotCalls := 0
gotRes, gotErr := RetryOn504(context.Background(), func(ctx context.Context) (*workspace.ObjectInfo, error) {
gotCalls++
return w.WorkspaceClient.Workspace.GetStatusByPath(ctx, "path")
})

assert.ErrorIs(t, gotErr, wantErr)
assert.Equal(t, gotRes, wantRes)
assert.Equal(t, gotCalls, wantCalls)
}
12 changes: 9 additions & 3 deletions permissions/resource_permissions.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,16 @@ func (a PermissionsAPI) readRaw(objectID string, mapping resourcePermissions) (*
}
idParts := strings.Split(objectID, "/")
id := idParts[len(idParts)-1]
permissions, err := w.Permissions.Get(a.context, iam.GetPermissionRequest{
RequestObjectId: id,
RequestObjectType: mapping.requestObjectType,

// TODO: This a temporary measure to implement retry on 504 until this is
// supported natively in the Go SDK.
permissions, err := common.RetryOn504(a.context, func(ctx context.Context) (*iam.ObjectPermissions, error) {
return w.Permissions.Get(a.context, iam.GetPermissionRequest{
RequestObjectId: id,
RequestObjectType: mapping.requestObjectType,
})
})

var apiErr *apierr.APIError
// https://github.com/databricks/terraform-provider-databricks/issues/1227
// platform propagates INVALID_STATE error for auto-purged clusters in
Expand Down
30 changes: 30 additions & 0 deletions permissions/resource_permissions_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,36 @@ func TestResourcePermissionsRead_EmptyListResultsInRemoval(t *testing.T) {
}.ApplyNoError(t)
}

func TestResourcePermissionsRead_EmptyListResultsInRemovalWith504Errors(t *testing.T) {
qa.ResourceFixture{
MockWorkspaceClientFunc: func(mwc *mocks.MockWorkspaceClient) {
mwc.GetMockCurrentUserAPI().EXPECT().Me(mock.Anything).Return(&iam.User{UserName: "admin"}, nil)

req := iam.GetPermissionRequest{
RequestObjectId: "abc",
RequestObjectType: "clusters",
}

// Fail 3 times with a 504 error. These should be retried
// transparently.
call := mwc.GetMockPermissionsAPI().EXPECT().Get(mock.Anything, req).Return(nil, apierr.ErrDeadlineExceeded)
call.Repeatability = 3

mwc.GetMockPermissionsAPI().EXPECT().Get(mock.Anything, req).Return(&iam.ObjectPermissions{
ObjectId: "/clusters/abc",
ObjectType: "cluster",
}, nil)
},
Resource: ResourcePermissions(),
Read: true,
Removed: true,
InstanceState: map[string]string{
"cluster_id": "abc",
},
ID: "/clusters/abc",
}.ApplyNoError(t)
}

func TestResourcePermissionsDelete(t *testing.T) {
d, err := qa.ResourceFixture{
MockWorkspaceClientFunc: func(mwc *mocks.MockWorkspaceClient) {
Expand Down
Loading