2022-08-24 21:45:55 +00:00
|
|
|
// Copyright 2021-2022 the Pinniped contributors. All Rights Reserved.
|
2021-08-18 04:14:38 +00:00
|
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
|
|
|
package leaderelection
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"go.uber.org/atomic"
|
|
|
|
appsv1 "k8s.io/api/apps/v1"
|
|
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
2021-08-23 16:50:25 +00:00
|
|
|
"k8s.io/client-go/kubernetes"
|
2021-08-18 04:14:38 +00:00
|
|
|
"k8s.io/client-go/tools/leaderelection"
|
|
|
|
"k8s.io/client-go/tools/leaderelection/resourcelock"
|
|
|
|
|
|
|
|
"go.pinniped.dev/internal/constable"
|
2021-08-29 00:38:50 +00:00
|
|
|
"go.pinniped.dev/internal/controllerinit"
|
2021-08-18 04:14:38 +00:00
|
|
|
"go.pinniped.dev/internal/downward"
|
|
|
|
"go.pinniped.dev/internal/kubeclient"
|
|
|
|
"go.pinniped.dev/internal/plog"
|
|
|
|
)
|
|
|
|
|
|
|
|
// ErrNotLeader is the error that the leader election middleware uses to fail
// a non-read request when this process does not currently hold the leader
// election lock.
const ErrNotLeader constable.Error = "write attempt rejected as client is not leader"
|
|
|
|
|
|
|
|
// New returns a client that has a leader election middleware injected into it.
|
|
|
|
// This middleware will prevent all non-read requests to the Kubernetes API when
|
|
|
|
// the current process does not hold the leader election lock. Unlike normal
|
|
|
|
// leader election where the process blocks until it acquires the lock, this
|
|
|
|
// middleware approach lets the process run as normal for all read requests.
|
|
|
|
// Another difference is that if the process acquires the lock and then loses it
|
|
|
|
// (i.e. a failed renewal), it will not exit (i.e. restart). Instead, it will
|
|
|
|
// simply attempt to acquire the lock again.
|
|
|
|
//
|
|
|
|
// The returned function is blocking and will run the leader election polling
|
|
|
|
// logic and will coordinate lease release with the input controller starter function.
|
|
|
|
func New(podInfo *downward.PodInfo, deployment *appsv1.Deployment, opts ...kubeclient.Option) (
|
|
|
|
*kubeclient.Client,
|
2021-08-29 00:38:50 +00:00
|
|
|
controllerinit.RunnerWrapper,
|
2021-08-18 04:14:38 +00:00
|
|
|
error,
|
|
|
|
) {
|
|
|
|
internalClient, err := kubeclient.New(opts...)
|
|
|
|
if err != nil {
|
|
|
|
return nil, nil, fmt.Errorf("could not create internal client for leader election: %w", err)
|
|
|
|
}
|
|
|
|
|
2021-08-23 16:50:25 +00:00
|
|
|
isLeader := &isLeaderTracker{tracker: atomic.NewBool(false)}
|
2021-08-18 04:14:38 +00:00
|
|
|
|
|
|
|
identity := podInfo.Name
|
|
|
|
leaseName := deployment.Name
|
|
|
|
|
2021-08-23 16:50:25 +00:00
|
|
|
leaderElectionConfig := newLeaderElectionConfig(podInfo.Namespace, leaseName, identity, internalClient.Kubernetes, isLeader)
|
2021-08-18 04:14:38 +00:00
|
|
|
|
|
|
|
// validate our config here before we rely on it being functioning below
|
|
|
|
if _, err := leaderelection.NewLeaderElector(leaderElectionConfig); err != nil {
|
|
|
|
return nil, nil, fmt.Errorf("invalid config - could not create leader elector: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
writeOnlyWhenLeader := kubeclient.MiddlewareFunc(func(_ context.Context, rt kubeclient.RoundTrip) {
|
|
|
|
switch rt.Verb() {
|
|
|
|
case kubeclient.VerbGet, kubeclient.VerbList, kubeclient.VerbWatch:
|
|
|
|
// reads are always allowed.
|
|
|
|
// note that while our pods/exec into the kube cert agent pod is a write request from the
|
|
|
|
// perspective of the Kube API, it is semantically a read request since no mutation occurs.
|
|
|
|
// we simply use it to fill a cache, and we need all pods to have a functioning cache.
|
|
|
|
// however, we do not need to handle it here because remotecommand.NewSPDYExecutor uses a
|
|
|
|
// kubeclient.Client.JSONConfig as input. since our middleware logic is only injected into
|
|
|
|
// the generated clientset code, this JSONConfig simply ignores this middleware all together.
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2021-08-23 16:50:25 +00:00
|
|
|
if isLeader.canWrite() { // only perform "expensive" test for writes
|
2021-08-18 04:14:38 +00:00
|
|
|
return // we are currently the leader, all actions are permitted
|
|
|
|
}
|
|
|
|
|
|
|
|
rt.MutateRequest(func(_ kubeclient.Object) error {
|
|
|
|
return ErrNotLeader // we are not the leader, fail the write request
|
|
|
|
})
|
|
|
|
})
|
|
|
|
|
|
|
|
leaderElectionOpts := append(
|
|
|
|
// all middleware are always executed so this being the first middleware is not relevant
|
|
|
|
[]kubeclient.Option{kubeclient.WithMiddleware(writeOnlyWhenLeader)},
|
|
|
|
opts..., // do not mutate input slice
|
|
|
|
)
|
|
|
|
|
|
|
|
client, err := kubeclient.New(leaderElectionOpts...)
|
|
|
|
if err != nil {
|
|
|
|
return nil, nil, fmt.Errorf("could not create leader election client: %w", err)
|
|
|
|
}
|
|
|
|
|
2021-08-29 00:38:50 +00:00
|
|
|
controllersWithLeaderElector := func(ctx context.Context, controllers controllerinit.Runner) {
|
|
|
|
plog.Debug("leader election loop start", "identity", identity)
|
|
|
|
defer plog.Debug("leader election loop shutdown", "identity", identity)
|
|
|
|
|
2021-08-18 04:14:38 +00:00
|
|
|
leaderElectorCtx, leaderElectorCancel := context.WithCancel(context.Background()) // purposefully detached context
|
|
|
|
|
|
|
|
go func() {
|
2021-08-23 16:50:25 +00:00
|
|
|
controllers(ctx) // run the controllers with the global context, this blocks until the context is canceled
|
|
|
|
|
|
|
|
if isLeader.stop() { // remove our in-memory leader status before we release the lock
|
|
|
|
plog.Debug("leader lost", "identity", identity, "reason", "controller stop")
|
|
|
|
}
|
2021-08-18 04:14:38 +00:00
|
|
|
leaderElectorCancel() // once the controllers have all stopped, tell the leader elector to release the lock
|
|
|
|
}()
|
|
|
|
|
|
|
|
for { // run (and rerun on release) the leader elector with its own context (blocking)
|
|
|
|
select {
|
|
|
|
case <-leaderElectorCtx.Done():
|
|
|
|
return // keep trying to run until process exit
|
|
|
|
|
|
|
|
default:
|
|
|
|
// blocks while trying to acquire lease, unblocks on release.
|
|
|
|
// note that this creates a new leader elector on each loop to
|
|
|
|
// prevent any bugs from reusing that struct across elections.
|
|
|
|
// our config was validated above so this should never die.
|
|
|
|
leaderelection.RunOrDie(leaderElectorCtx, leaderElectionConfig)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return client, controllersWithLeaderElector, nil
|
|
|
|
}
|
2021-08-23 16:50:25 +00:00
|
|
|
|
|
|
|
func newLeaderElectionConfig(namespace, leaseName, identity string, internalClient kubernetes.Interface, isLeader *isLeaderTracker) leaderelection.LeaderElectionConfig {
|
|
|
|
return leaderelection.LeaderElectionConfig{
|
|
|
|
Lock: &releaseLock{
|
|
|
|
delegate: &resourcelock.LeaseLock{
|
|
|
|
LeaseMeta: metav1.ObjectMeta{
|
|
|
|
Namespace: namespace,
|
|
|
|
Name: leaseName,
|
|
|
|
},
|
|
|
|
Client: internalClient.CoordinationV1(),
|
|
|
|
LockConfig: resourcelock.ResourceLockConfig{
|
|
|
|
Identity: identity,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
isLeader: isLeader,
|
|
|
|
identity: identity,
|
|
|
|
},
|
|
|
|
ReleaseOnCancel: true, // semantics for correct release handled by releaseLock.Update and controllersWithLeaderElector below
|
2021-08-24 18:57:39 +00:00
|
|
|
|
|
|
|
// Copied from defaults used in OpenShift since we want the same semantics:
|
|
|
|
// https://github.com/openshift/library-go/blob/e14e06ba8d476429b10cc6f6c0fcfe6ea4f2c591/pkg/config/leaderelection/leaderelection.go#L87-L109
|
|
|
|
LeaseDuration: 137 * time.Second,
|
|
|
|
RenewDeadline: 107 * time.Second,
|
|
|
|
RetryPeriod: 26 * time.Second,
|
|
|
|
|
2021-08-23 16:50:25 +00:00
|
|
|
Callbacks: leaderelection.LeaderCallbacks{
|
|
|
|
OnStartedLeading: func(_ context.Context) {
|
|
|
|
plog.Debug("leader gained", "identity", identity)
|
|
|
|
isLeader.start()
|
|
|
|
},
|
|
|
|
OnStoppedLeading: func() {
|
|
|
|
if isLeader.stop() { // barring changes to client-go, this branch should only be taken on a panic
|
|
|
|
plog.Debug("leader lost", "identity", identity, "reason", "on stop")
|
|
|
|
}
|
|
|
|
},
|
|
|
|
OnNewLeader: func(newLeader string) {
|
|
|
|
if newLeader == identity {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
plog.Debug("new leader elected", "newLeader", newLeader)
|
|
|
|
},
|
|
|
|
},
|
|
|
|
Name: leaseName,
|
|
|
|
// this must be set to nil because we do not want to associate /healthz with a failed
|
|
|
|
// leader election renewal as we do not want to exit the process if the leader changes.
|
|
|
|
WatchDog: nil,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
type isLeaderTracker struct {
|
|
|
|
tracker *atomic.Bool
|
|
|
|
}
|
|
|
|
|
|
|
|
func (t *isLeaderTracker) canWrite() bool {
|
|
|
|
return t.tracker.Load()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (t *isLeaderTracker) start() {
|
|
|
|
t.tracker.Store(true)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (t *isLeaderTracker) stop() (didStop bool) {
|
2022-08-24 21:45:55 +00:00
|
|
|
return t.tracker.CompareAndSwap(true, false)
|
2021-08-23 16:50:25 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// note that resourcelock.Interface is an internal, unstable interface.
|
|
|
|
// so while it would be convenient to embed the implementation within
|
|
|
|
// this struct, we need to make sure our Update override is used and
|
|
|
|
// that no other methods are added that change the meaning of the
|
|
|
|
// interface. thus we must have ~20 lines of boilerplate to have the
|
|
|
|
// compiler ensure that we keep up with this interface over time.
|
|
|
|
var _ resourcelock.Interface = &releaseLock{}
|
|
|
|
|
|
|
|
// releaseLock works around a limitation of the client-go leader election code:
|
|
|
|
// there is no "BeforeRelease" callback. By the time the "OnStoppedLeading"
|
|
|
|
// callback runs (this callback is meant to always run at the very end since it
|
|
|
|
// normally terminates the process), we have already released the lock. This
|
|
|
|
// creates a race condition in between the release call (the Update func) and the
|
|
|
|
// stop callback where a different client could acquire the lease while we still
|
|
|
|
// believe that we hold the lease in our in-memory leader status.
|
|
|
|
type releaseLock struct {
|
|
|
|
delegate resourcelock.Interface // do not embed this, see comment above
|
|
|
|
isLeader *isLeaderTracker
|
|
|
|
identity string
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *releaseLock) Update(ctx context.Context, ler resourcelock.LeaderElectionRecord) error {
|
|
|
|
// setting an empty HolderIdentity on update means that the client is releasing the lock.
|
|
|
|
// thus we need to make sure to update our in-memory leader status before this occurs
|
|
|
|
// since other clients could immediately acquire the lock. note that even if the Update
|
|
|
|
// call below fails, this client has already chosen to release the lock and thus we must
|
|
|
|
// update the in-memory status regardless of it we succeed in making the Kube API call.
|
|
|
|
// note that while resourcelock.Interface is an unstable interface, the meaning of an
|
|
|
|
// empty HolderIdentity is encoded into the Kube API and thus we can safely rely on that
|
|
|
|
// not changing (since changing that would break older clients).
|
|
|
|
if len(ler.HolderIdentity) == 0 && r.isLeader.stop() {
|
|
|
|
plog.Debug("leader lost", "identity", r.identity, "reason", "release")
|
|
|
|
}
|
|
|
|
|
|
|
|
return r.delegate.Update(ctx, ler)
|
|
|
|
}
|
|
|
|
|
|
|
|
// boilerplate passthrough methods below
|
|
|
|
|
|
|
|
func (r *releaseLock) Get(ctx context.Context) (*resourcelock.LeaderElectionRecord, []byte, error) {
|
|
|
|
return r.delegate.Get(ctx)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *releaseLock) Create(ctx context.Context, ler resourcelock.LeaderElectionRecord) error {
|
|
|
|
return r.delegate.Create(ctx, ler)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *releaseLock) RecordEvent(s string) {
|
|
|
|
r.delegate.RecordEvent(s)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *releaseLock) Identity() string {
|
|
|
|
return r.delegate.Identity()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *releaseLock) Describe() string {
|
|
|
|
return r.delegate.Describe()
|
|
|
|
}
|