Skip to content

Commit

Permalink
Azure: fix node removal race condition on VMSS deletion
Browse files Browse the repository at this point in the history
When a VMSS is being deleted, instances are removed first. The VMSS
itself will disappear once empty. That delay is generally enough for
kube-controller-manager to delete the corresponding k8s nodes, but
might not when busy or throttled (for instance).

If kubernetes nodes remains after their backing VMSS were removed, Azure
cloud-provider will fail listing that VMSS VMs, and downstream callers
(ie. `InstanceExistsByProviderID`) won't account those errors for a
missing instance. The nodes will remain (still considered as "existing"),
and controller-manager will indefinitely retry to VMSS VMs list it,
draining API calls quotas, potentially causing throttling.

In practice a missing scale set implies instances attributed to that
VMSS don't exists either: `InstanceExistsByProviderID` (part of the
general cloud provider interface) should return false in that case.
  • Loading branch information
bpineau committed Oct 4, 2020
1 parent 0d1ac16 commit ee7cd25
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ import (
"k8s.io/legacy-cloud-providers/azure/clients/interfaceclient/mockinterfaceclient"
"k8s.io/legacy-cloud-providers/azure/clients/publicipclient/mockpublicipclient"
"k8s.io/legacy-cloud-providers/azure/clients/vmclient/mockvmclient"
"k8s.io/legacy-cloud-providers/azure/clients/vmssclient/mockvmssclient"
"k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient/mockvmssvmclient"
"k8s.io/legacy-cloud-providers/azure/retry"
)

Expand Down Expand Up @@ -633,6 +635,58 @@ func TestInstanceExistsByProviderID(t *testing.T) {
assert.Equal(t, test.expectedErrMsg, err, test.name)
assert.Equal(t, test.expected, exist, test.name)
}

vmssTestCases := []struct {
name string
providerID string
scaleSet string
vmList []string
expected bool
rerr *retry.Error
}{
{
name: "InstanceExistsByProviderID should return true if VMSS and VM exist",
providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmssee6c2/virtualMachines/0",
scaleSet: "vmssee6c2",
vmList: []string{"vmssee6c2000000"},
expected: true,
},
{
name: "InstanceExistsByProviderID should return false if VMSS exist but VM doesn't",
providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmssee6c2/virtualMachines/0",
scaleSet: "vmssee6c2",
expected: false,
},
{
name: "InstanceExistsByProviderID should return false if VMSS doesn't exist",
providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/missing-vmss/virtualMachines/0",
rerr: &retry.Error{HTTPStatusCode: 404},
expected: false,
},
}

for _, test := range vmssTestCases {
ss, err := newTestScaleSet(ctrl)
assert.NoError(t, err, test.name)
cloud.VMSet = ss

mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
ss.cloud.VirtualMachineScaleSetsClient = mockVMSSClient
ss.cloud.VirtualMachineScaleSetVMsClient = mockVMSSVMClient

expectedScaleSet := buildTestVMSS(test.scaleSet, test.scaleSet)
mockVMSSClient.EXPECT().List(gomock.Any(), gomock.Any()).Return([]compute.VirtualMachineScaleSet{expectedScaleSet}, test.rerr).AnyTimes()

expectedVMs, _, _ := buildTestVirtualMachineEnv(ss.cloud, test.scaleSet, "", 0, test.vmList, "succeeded", false)
mockVMSSVMClient.EXPECT().List(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(expectedVMs, test.rerr).AnyTimes()

mockVMsClient := ss.cloud.VirtualMachinesClient.(*mockvmclient.MockInterface)
mockVMsClient.EXPECT().List(gomock.Any(), gomock.Any()).Return([]compute.VirtualMachine{}, nil).AnyTimes()

exist, _ := cloud.InstanceExistsByProviderID(context.Background(), test.providerID)
assert.Equal(t, test.expected, exist, test.name)
}
}

func TestNodeAddressesByProviderID(t *testing.T) {
Expand Down
3 changes: 3 additions & 0 deletions staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,9 @@ func (ss *scaleSet) listScaleSetVMs(scaleSetName, resourceGroup string) ([]compu
allVMs, rerr := ss.VirtualMachineScaleSetVMsClient.List(ctx, resourceGroup, scaleSetName, string(compute.InstanceView))
if rerr != nil {
klog.Errorf("VirtualMachineScaleSetVMsClient.List failed: %v", rerr)
if rerr.IsNotFound() {
return nil, cloudprovider.InstanceNotFound
}
return nil, rerr.Error()
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,15 @@ func (err *Error) IsThrottled() bool {
return err.HTTPStatusCode == http.StatusTooManyRequests || err.RetryAfter.After(now())
}

// IsNotFound returns true the if the requested object wasn't found
func (err *Error) IsNotFound() bool {
if err == nil {
return false
}

return err.HTTPStatusCode == http.StatusNotFound
}

// NewError creates a new Error.
func NewError(retriable bool, err error) *Error {
return &Error{
Expand Down

0 comments on commit ee7cd25

Please sign in to comment.