From 14bfc52d36f0ba093f3665c4575741610681b38c Mon Sep 17 00:00:00 2001 From: Dong Chen Date: Wed, 20 Jan 2016 17:49:40 -0800 Subject: [PATCH 1/2] Enforce minimum backoff to avoid simultaneous validation on one engine. Signed-off-by: Dong Chen --- cluster/engine.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cluster/engine.go b/cluster/engine.go index 1d56b0fdb9..3c752df985 100644 --- a/cluster/engine.go +++ b/cluster/engine.go @@ -239,7 +239,9 @@ func (e *Engine) TimeToValidate() bool { } sinceLastUpdate := time.Since(e.updatedAt) // Increase check interval for a pending engine according to failureCount and cap it at a limit - if sinceLastUpdate > validationLimit || sinceLastUpdate > time.Duration(e.failureCount)*failureBackoff { + // '+1' would enforce a minimum backoff because e.failureCount could be 0 at first join, or + // the engine has a duplicate ID + if sinceLastUpdate > validationLimit || sinceLastUpdate > time.Duration(e.failureCount+1)*failureBackoff { return true } return false From 46a33f70ffdbe08ff7ea813b320d6cee17b345e8 Mon Sep 17 00:00:00 2001 From: Dong Chen Date: Thu, 21 Jan 2016 14:02:34 -0800 Subject: [PATCH 2/2] Use exponential backoff strategy to validate pending engine. Signed-off-by: Dong Chen --- cluster/engine.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cluster/engine.go b/cluster/engine.go index 3c752df985..f091eda0db 100644 --- a/cluster/engine.go +++ b/cluster/engine.go @@ -231,7 +231,7 @@ func (e *Engine) setState(state engineState) { // TimeToValidate returns true if a pending node is up for validation func (e *Engine) TimeToValidate() bool { const validationLimit time.Duration = 4 * time.Hour - const failureBackoff time.Duration = 30 * time.Second + const minFailureBackoff time.Duration = 30 * time.Second e.Lock() defer e.Unlock() if e.state != statePending { @@ -239,9 +239,10 @@ func (e *Engine) TimeToValidate() bool { } sinceLastUpdate := time.Since(e.updatedAt) // Increase check interval for a pending engine according to failureCount and cap it at a limit - // '+1' would enforce a minimum backoff because e.failureCount could be 0 at first join, or - // the engine has a duplicate ID - if sinceLastUpdate > validationLimit || sinceLastUpdate > time.Duration(e.failureCount+1)*failureBackoff { + // it's exponential backoff = 2 ^ failureCount + minFailureBackoff. A minimum backoff is + // needed because e.failureCount could be 0 at first join, or the engine has a duplicate ID + if sinceLastUpdate > validationLimit || + sinceLastUpdate > (1<