From ca6c29e46351b09e16f579c6b8dbad8f99282dc2 Mon Sep 17 00:00:00 2001
From: Ryan Richard <richardry@vmware.com>
Date: Wed, 20 Sep 2023 16:51:23 -0700
Subject: [PATCH] Fix deadlock during shutdown which prevented leader election
 cleanup

Before this fix, the deadlock would prevent the leader pod from giving
up its lease, which would make it take several minutes for new pods to
be allowed to elect a new leader. During that time, no Pinniped
controllers could write to the Kube API, so important resources were not
being updated during that window. It would also make pod shutdown take
about 1 minute.

After this fix, the leader gives up its lease immediately, and pod
shutdown takes about 1 second. This improves restart/upgrade time and
also fixes the problem where there was no leader for several minutes
after a restart/upgrade.

The deadlock was between the post-start hook and the pre-shutdown hook.
The pre-shutdown hook blocked until a certain background goroutine in
the post-start hook finished, but that goroutine could not finish until
the pre-shutdown hook finished. Thus, they were both blocked, waiting
for each other infinitely. Eventually the process would be externally
killed.

This deadlock was most likely introduced by some change in Kube's
generic api server package related to how the many complex channels used
during server shutdown interact with each other, and was not noticed
when we upgraded to the version which introduced the change.
---
 internal/concierge/apiserver/apiserver.go  | 32 +++++++++++++---------
 internal/concierge/server/server.go        | 13 ++++++++-
 internal/controllerlib/controller.go       |  5 +++-
 internal/leaderelection/leaderelection.go  |  1 +
 internal/supervisor/apiserver/apiserver.go | 32 +++++++++++++---------
 internal/supervisor/server/server.go       | 10 +++++--
 6 files changed, 62 insertions(+), 31 deletions(-)

diff --git a/internal/concierge/apiserver/apiserver.go b/internal/concierge/apiserver/apiserver.go
index e1e7d074..56f7a8a8 100644
--- a/internal/concierge/apiserver/apiserver.go
+++ b/internal/concierge/apiserver/apiserver.go
@@ -107,39 +107,45 @@ func (c completedConfig) New() (*PinnipedServer, error) {
 		return nil, fmt.Errorf("could not install API groups: %w", err)
 	}
 
-	shutdown := &sync.WaitGroup{}
+	controllersShutdownWaitGroup := &sync.WaitGroup{}
+	controllersCtx, cancelControllerCtx := context.WithCancel(context.Background())
+
 	s.GenericAPIServer.AddPostStartHookOrDie("start-controllers",
 		func(postStartContext genericapiserver.PostStartHookContext) error {
 			plog.Debug("start-controllers post start hook starting")
+			defer plog.Debug("start-controllers post start hook completed")
 
-			ctx, cancel := context.WithCancel(context.Background())
-			go func() {
-				defer cancel()
-
-				<-postStartContext.StopCh
-			}()
-
-			runControllers, err := c.ExtraConfig.BuildControllersPostStartHook(ctx)
+			runControllers, err := c.ExtraConfig.BuildControllersPostStartHook(controllersCtx)
 			if err != nil {
 				return fmt.Errorf("cannot create run controller func: %w", err)
 			}
 
-			shutdown.Add(1)
+			controllersShutdownWaitGroup.Add(1)
 			go func() {
-				defer shutdown.Done()
+				// When this goroutine ends, then also end the WaitGroup, allowing anyone who called Wait() to proceed.
+				defer controllersShutdownWaitGroup.Done()
 
-				runControllers(ctx)
+				// Start the controllers and block until their context is cancelled and they have shut down.
+				runControllers(controllersCtx)
+				plog.Debug("start-controllers post start hook's background goroutine saw runControllers() finish")
 			}()
 
 			return nil
 		},
 	)
+
 	s.GenericAPIServer.AddPreShutdownHookOrDie("stop-controllers",
 		func() error {
 			plog.Debug("stop-controllers pre shutdown hook starting")
 			defer plog.Debug("stop-controllers pre shutdown hook completed")
 
-			shutdown.Wait()
+			// The generic api server is telling us that it wants to shut down, so tell our controllers that we
+			// want them to shut down by cancelling their context.
+			cancelControllerCtx()
+
+			// Now wait for the controllers to finish shutting down. By blocking here, we prevent the generic api server's
+			// graceful shutdown process from continuing until we are finished shutting down our own controllers.
+			controllersShutdownWaitGroup.Wait()
 
 			return nil
 		},
diff --git a/internal/concierge/server/server.go b/internal/concierge/server/server.go
index 95fa26c5..4a04f39f 100644
--- a/internal/concierge/server/server.go
+++ b/internal/concierge/server/server.go
@@ -100,6 +100,7 @@ func addCommandlineFlagsToCommand(cmd *cobra.Command, app *App) {
 }
 
 // Boot the aggregated API server, which will in turn boot the controllers.
+// In practice, the ctx passed in should be one which will be cancelled when the process receives SIGTERM or SIGINT.
 func (a *App) runServer(ctx context.Context) error {
 	// Read the server config file.
 	cfg, err := concierge.FromPath(ctx, a.configPath)
@@ -186,7 +187,9 @@ func (a *App) runServer(ctx context.Context) error {
 		return fmt.Errorf("could not create aggregated API server: %w", err)
 	}
 
-	// Run the server. Its post-start hook will start the controllers.
+	// Run the server. Its post-start hook will start the controllers. Its pre shutdown hook will be called when ctx is
+	// cancelled, and that hook should graceful stop the controllers and give up the leader election lease. See the
+	// code for these hooks in internal/concierge/apiserver.go.
 	return server.GenericAPIServer.PrepareRun().Run(ctx.Done())
 }
 
@@ -276,8 +279,16 @@ func main() error {
 		"time-since-build", timeSinceCompile,
 	)
 
+	// This context will be cancelled upon the first SIGTERM or SIGINT, and will os.Exit() to kill the process
+	// upon the second SIGTERM or SIGINT.
 	ctx := genericapiserver.SetupSignalContext()
 
+	// Just for debugging purposes, log when the first signal is received.
+	go func() {
+		<-ctx.Done() // wait for the Done channel to be closed, indicating that ctx was cancelled by the signal handler
+		plog.Debug("concierge shutdown initiated due to process receiving SIGTERM or SIGINT")
+	}()
+
 	return New(ctx, os.Args[1:], os.Stdout, os.Stderr).Run()
 }
 
diff --git a/internal/controllerlib/controller.go b/internal/controllerlib/controller.go
index 9314cd47..9ce2e484 100644
--- a/internal/controllerlib/controller.go
+++ b/internal/controllerlib/controller.go
@@ -1,4 +1,4 @@
-// Copyright 2020 the Pinniped contributors. All Rights Reserved.
+// Copyright 2020-2023 the Pinniped contributors. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 package controllerlib
@@ -102,6 +102,7 @@ func (c *controller) Run(ctx context.Context, workers int) {
 	workerContext, workerContextCancel := context.WithCancel(context.Background())
 
 	defer func() {
+		plog.Debug("starting to shut down controller workers", "controller", c.Name(), "workers", workers)
 		c.queue.ShutDown()    // shutdown the controller queue first
 		workerContextCancel() // cancel the worker context, which tell workers to initiate shutdown
 
@@ -126,7 +127,9 @@ func (c *controller) Run(ctx context.Context, workers int) {
 		}()
 	}
 
+	plog.Debug("controller started", "controller", c.Name(), "workers", workers)
 	<-ctx.Done() // wait for controller context to be cancelled
+	plog.Debug("controller context cancelled, next will terminate workers", "controller", c.Name(), "workers", workers)
 }
 
 func (c *controller) invokeAllRunOpts() {
diff --git a/internal/leaderelection/leaderelection.go b/internal/leaderelection/leaderelection.go
index 82e349da..4feab236 100644
--- a/internal/leaderelection/leaderelection.go
+++ b/internal/leaderelection/leaderelection.go
@@ -98,6 +98,7 @@ func New(podInfo *downward.PodInfo, deployment *appsv1.Deployment, opts ...kubec
 
 		go func() {
 			controllers(ctx) // run the controllers with the global context, this blocks until the context is canceled
+			plog.Debug("leader election saw controllers have stopped")
 
 			if isLeader.stop() { // remove our in-memory leader status before we release the lock
 				plog.Debug("leader lost", "identity", identity, "reason", "controller stop")
diff --git a/internal/supervisor/apiserver/apiserver.go b/internal/supervisor/apiserver/apiserver.go
index 5c0a4c9a..6fd040be 100644
--- a/internal/supervisor/apiserver/apiserver.go
+++ b/internal/supervisor/apiserver/apiserver.go
@@ -113,39 +113,45 @@ func (c completedConfig) New() (*PinnipedServer, error) {
 		return nil, fmt.Errorf("could not install API groups: %w", err)
 	}
 
-	shutdown := &sync.WaitGroup{}
+	controllersShutdownWaitGroup := &sync.WaitGroup{}
+	controllersCtx, cancelControllerCtx := context.WithCancel(context.Background())
+
 	s.GenericAPIServer.AddPostStartHookOrDie("start-controllers",
 		func(postStartContext genericapiserver.PostStartHookContext) error {
 			plog.Debug("start-controllers post start hook starting")
+			defer plog.Debug("start-controllers post start hook completed")
 
-			ctx, cancel := context.WithCancel(context.Background())
-			go func() {
-				defer cancel()
-
-				<-postStartContext.StopCh
-			}()
-
-			runControllers, err := c.ExtraConfig.BuildControllersPostStartHook(ctx)
+			runControllers, err := c.ExtraConfig.BuildControllersPostStartHook(controllersCtx)
 			if err != nil {
 				return fmt.Errorf("cannot create run controller func: %w", err)
 			}
 
-			shutdown.Add(1)
+			controllersShutdownWaitGroup.Add(1)
 			go func() {
-				defer shutdown.Done()
+				// When this goroutine ends, then also end the WaitGroup, allowing anyone who called Wait() to proceed.
+				defer controllersShutdownWaitGroup.Done()
 
-				runControllers(ctx)
+				// Start the controllers and block until their context is cancelled and they have shut down.
+				runControllers(controllersCtx)
+				plog.Debug("start-controllers post start hook's background goroutine saw runControllers() finish")
 			}()
 
 			return nil
 		},
 	)
+
 	s.GenericAPIServer.AddPreShutdownHookOrDie("stop-controllers",
 		func() error {
 			plog.Debug("stop-controllers pre shutdown hook starting")
 			defer plog.Debug("stop-controllers pre shutdown hook completed")
 
-			shutdown.Wait()
+			// The generic api server is telling us that it wants to shut down, so tell our controllers that we
+			// want them to shut down by cancelling their context.
+			cancelControllerCtx()
+
+			// Now wait for the controllers to finish shutting down. By blocking here, we prevent the generic api server's
+			// graceful shutdown process from continuing until we are finished shutting down our own controllers.
+			controllersShutdownWaitGroup.Wait()
 
 			return nil
 		},
diff --git a/internal/supervisor/server/server.go b/internal/supervisor/server/server.go
index d6605b2b..333884dc 100644
--- a/internal/supervisor/server/server.go
+++ b/internal/supervisor/server/server.go
@@ -382,8 +382,10 @@ func prepareControllers(
 	return controllerinit.Prepare(controllerManager.Start, leaderElector, kubeInformers, pinnipedInformers)
 }
 
-//nolint:funlen
-func runSupervisor(ctx context.Context, podInfo *downward.PodInfo, cfg *supervisor.Config) error {
+// Boot the aggregated API server, which will in turn boot the controllers. Also open the appropriate network ports
+// and start serving the health endpoint and the endpoints of the configured FederationDomains.
+// In practice, the ctx passed in should be one which will be cancelled when the process receives SIGTERM or SIGINT.
+func runSupervisor(ctx context.Context, podInfo *downward.PodInfo, cfg *supervisor.Config) error { //nolint:funlen
 	serverInstallationNamespace := podInfo.Namespace
 	clientSecretSupervisorGroupData := groupsuffix.SupervisorAggregatedGroups(*cfg.APIGroupSuffix)
 
@@ -575,7 +577,9 @@ func runSupervisor(ctx context.Context, podInfo *downward.PodInfo, cfg *supervis
 	plog.Debug("supervisor started")
 	defer plog.Debug("supervisor exiting")
 
-	// Run the server. Its post-start hook will start the controllers.
+	// Run the server. Its post-start hook will start the controllers. Its pre shutdown hook will be called when ctx is
+	// cancelled, and that hook should graceful stop the controllers and give up the leader election lease. See the
+	// code for these hooks in internal/supervisor/apiserver.go.
 	err = server.GenericAPIServer.PrepareRun().Run(ctx.Done())
 	if err != nil {
 		return err