leader election: use better duration defaults

OpenShift has good defaults for these duration fields that we can
use instead of coming up with them ourselves:

https://github.com/openshift/library-go/blob/e14e06ba8d476429b10cc6f6c0fcfe6ea4f2c591/pkg/config/leaderelection/leaderelection.go#L87-L109

Copied here for easy future reference:

// We want to be able to tolerate 60s of kube-apiserver disruption without causing pod restarts.
// We want the graceful lease re-acquisition fairly quick to avoid waits on new deployments and other rollouts.
// We want a single set of guidance for nearly every lease in openshift.  If you're special, we'll let you know.
// 1. clock skew tolerance is leaseDuration-renewDeadline == 30s
// 2. kube-apiserver downtime tolerance is == 78s
//      lastRetry=floor(renewDeadline/retryPeriod)*retryPeriod == 104
//      downtimeTolerance = lastRetry-retryPeriod == 78s
// 3. worst non-graceful lease acquisition is leaseDuration+retryPeriod == 163s
// 4. worst graceful lease acquisition is retryPeriod == 26s
if ret.LeaseDuration.Duration == 0 {
	ret.LeaseDuration.Duration = 137 * time.Second
}

if ret.RenewDeadline.Duration == 0 {
	// this gives 107/26=4 retries and allows for 137-107=30 seconds of clock skew
	// if the kube-apiserver is unavailable for 60s starting just before t=26 (the first renew),
	// then we will retry on 26s intervals until t=104 (kube-apiserver came back up at 86), and there will
	// be 33 seconds of extra time before the lease is lost.
	ret.RenewDeadline.Duration = 107 * time.Second
}
if ret.RetryPeriod.Duration == 0 {
	ret.RetryPeriod.Duration = 26 * time.Second
}

Signed-off-by: Monis Khan <mok@vmware.com>
This commit is contained in:
Monis Khan 2021-08-24 14:57:39 -04:00
parent c0617ceda4
commit c71ffdcd1e
No known key found for this signature in database
GPG Key ID: 52C90ADA01B269B8
2 changed files with 8 additions and 4 deletions

View File

@@ -136,9 +136,13 @@ func newLeaderElectionConfig(namespace, leaseName, identity string, internalClie
 			identity: identity,
 		},
 		ReleaseOnCancel: true, // semantics for correct release handled by releaseLock.Update and controllersWithLeaderElector below
-		LeaseDuration: 60 * time.Second,
-		RenewDeadline: 15 * time.Second,
-		RetryPeriod: 5 * time.Second,
+
+		// Copied from defaults used in OpenShift since we want the same semantics:
+		// https://github.com/openshift/library-go/blob/e14e06ba8d476429b10cc6f6c0fcfe6ea4f2c591/pkg/config/leaderelection/leaderelection.go#L87-L109
+		LeaseDuration: 137 * time.Second,
+		RenewDeadline: 107 * time.Second,
+		RetryPeriod: 26 * time.Second,
+
 		Callbacks: leaderelection.LeaderCallbacks{
 			OnStartedLeading: func(_ context.Context) {
 				plog.Debug("leader gained", "identity", identity)

View File

@@ -205,7 +205,7 @@ func waitForIdentity(ctx context.Context, t *testing.T, namespace *corev1.Namesp
 		}
 		out = lease
 		return lease.Spec.HolderIdentity != nil && identities.Has(*lease.Spec.HolderIdentity), nil
-	}, 3*time.Minute, time.Second)
+	}, 5*time.Minute, time.Second)
 
 	return out
 }