diff --git a/internal/concierge/apiserver/apiserver.go b/internal/concierge/apiserver/apiserver.go
index e1e7d074..56f7a8a8 100644
--- a/internal/concierge/apiserver/apiserver.go
+++ b/internal/concierge/apiserver/apiserver.go
@@ -107,39 +107,45 @@ func (c completedConfig) New() (*PinnipedServer, error) {
 		return nil, fmt.Errorf("could not install API groups: %w", err)
 	}
 
-	shutdown := &sync.WaitGroup{}
+	controllersShutdownWaitGroup := &sync.WaitGroup{}
+	controllersCtx, cancelControllerCtx := context.WithCancel(context.Background())
+
 	s.GenericAPIServer.AddPostStartHookOrDie("start-controllers",
 		func(postStartContext genericapiserver.PostStartHookContext) error {
 			plog.Debug("start-controllers post start hook starting")
+			defer plog.Debug("start-controllers post start hook completed")
 
-			ctx, cancel := context.WithCancel(context.Background())
-			go func() {
-				defer cancel()
-
-				<-postStartContext.StopCh
-			}()
-
-			runControllers, err := c.ExtraConfig.BuildControllersPostStartHook(ctx)
+			runControllers, err := c.ExtraConfig.BuildControllersPostStartHook(controllersCtx)
 			if err != nil {
 				return fmt.Errorf("cannot create run controller func: %w", err)
 			}
 
-			shutdown.Add(1)
+			controllersShutdownWaitGroup.Add(1)
 			go func() {
-				defer shutdown.Done()
+				// When this goroutine ends, then also end the WaitGroup, allowing anyone who called Wait() to proceed.
+				defer controllersShutdownWaitGroup.Done()
 
-				runControllers(ctx)
+				// Start the controllers and block until their context is cancelled and they have shut down.
+				runControllers(controllersCtx)
+				plog.Debug("start-controllers post start hook's background goroutine saw runControllers() finish")
 			}()
 
 			return nil
 		},
 	)
+
 	s.GenericAPIServer.AddPreShutdownHookOrDie("stop-controllers",
 		func() error {
 			plog.Debug("stop-controllers pre shutdown hook starting")
 			defer plog.Debug("stop-controllers pre shutdown hook completed")
 
-			shutdown.Wait()
+			// The generic api server is telling us that it wants to shut down, so tell our controllers that we
+			// want them to shut down by cancelling their context.
+			cancelControllerCtx()
+
+			// Now wait for the controllers to finish shutting down. By blocking here, we prevent the generic api server's
+			// graceful shutdown process from continuing until we are finished shutting down our own controllers.
+			controllersShutdownWaitGroup.Wait()
 
 			return nil
 		},
diff --git a/internal/concierge/server/server.go b/internal/concierge/server/server.go
index 95fa26c5..4a04f39f 100644
--- a/internal/concierge/server/server.go
+++ b/internal/concierge/server/server.go
@@ -100,6 +100,7 @@ func addCommandlineFlagsToCommand(cmd *cobra.Command, app *App) {
 }
 
 // Boot the aggregated API server, which will in turn boot the controllers.
+// In practice, the ctx passed in should be one which will be cancelled when the process receives SIGTERM or SIGINT.
 func (a *App) runServer(ctx context.Context) error {
 	// Read the server config file.
 	cfg, err := concierge.FromPath(ctx, a.configPath)
@@ -186,7 +187,9 @@ func (a *App) runServer(ctx context.Context) error {
 		return fmt.Errorf("could not create aggregated API server: %w", err)
 	}
 
-	// Run the server. Its post-start hook will start the controllers.
+	// Run the server. Its post-start hook will start the controllers. Its pre shutdown hook will be called when ctx is
+	// cancelled, and that hook should gracefully stop the controllers and give up the leader election lease. See the
+	// code for these hooks in internal/concierge/apiserver.go.
 	return server.GenericAPIServer.PrepareRun().Run(ctx.Done())
 }
 
@@ -276,8 +279,16 @@ func main() error {
 		"time-since-build", timeSinceCompile,
 	)
 
+	// This context will be cancelled upon the first SIGTERM or SIGINT, and will os.Exit() to kill the process
+	// upon the second SIGTERM or SIGINT.
 	ctx := genericapiserver.SetupSignalContext()
 
+	// Just for debugging purposes, log when the first signal is received.
+	go func() {
+		<-ctx.Done() // wait for the Done channel to be closed, indicating that ctx was cancelled by the signal handler
+		plog.Debug("concierge shutdown initiated due to process receiving SIGTERM or SIGINT")
+	}()
+
 	return New(ctx, os.Args[1:], os.Stdout, os.Stderr).Run()
 }
 
diff --git a/internal/controllerlib/controller.go b/internal/controllerlib/controller.go
index 9314cd47..9ce2e484 100644
--- a/internal/controllerlib/controller.go
+++ b/internal/controllerlib/controller.go
@@ -1,4 +1,4 @@
-// Copyright 2020 the Pinniped contributors. All Rights Reserved.
+// Copyright 2020-2023 the Pinniped contributors. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 package controllerlib
@@ -102,6 +102,7 @@ func (c *controller) Run(ctx context.Context, workers int) {
 	workerContext, workerContextCancel := context.WithCancel(context.Background())
 
 	defer func() {
+		plog.Debug("starting to shut down controller workers", "controller", c.Name(), "workers", workers)
 		c.queue.ShutDown()    // shutdown the controller queue first
 		workerContextCancel() // cancel the worker context, which tell workers to initiate shutdown
 
@@ -126,7 +127,9 @@ func (c *controller) Run(ctx context.Context, workers int) {
 		}()
 	}
 
+	plog.Debug("controller started", "controller", c.Name(), "workers", workers)
 	<-ctx.Done() // wait for controller context to be cancelled
+	plog.Debug("controller context cancelled, next will terminate workers", "controller", c.Name(), "workers", workers)
 }
 
 func (c *controller) invokeAllRunOpts() {
diff --git a/internal/leaderelection/leaderelection.go b/internal/leaderelection/leaderelection.go
index 82e349da..4feab236 100644
--- a/internal/leaderelection/leaderelection.go
+++ b/internal/leaderelection/leaderelection.go
@@ -98,6 +98,7 @@ func New(podInfo *downward.PodInfo, deployment *appsv1.Deployment, opts ...kubec
 		go func() {
 			controllers(ctx) // run the controllers with the global context, this blocks until the context is canceled
+			plog.Debug("leader election saw controllers have stopped")
 
 			if isLeader.stop() { // remove our in-memory leader status before we release the lock
 				plog.Debug("leader lost", "identity", identity, "reason", "controller stop")
diff --git a/internal/supervisor/apiserver/apiserver.go b/internal/supervisor/apiserver/apiserver.go
index 5c0a4c9a..6fd040be 100644
--- a/internal/supervisor/apiserver/apiserver.go
+++ b/internal/supervisor/apiserver/apiserver.go
@@ -113,39 +113,45 @@ func (c completedConfig) New() (*PinnipedServer, error) {
 		return nil, fmt.Errorf("could not install API groups: %w", err)
 	}
 
-	shutdown := &sync.WaitGroup{}
+	controllersShutdownWaitGroup := &sync.WaitGroup{}
+	controllersCtx, cancelControllerCtx := context.WithCancel(context.Background())
+
 	s.GenericAPIServer.AddPostStartHookOrDie("start-controllers",
 		func(postStartContext genericapiserver.PostStartHookContext) error {
 			plog.Debug("start-controllers post start hook starting")
+			defer plog.Debug("start-controllers post start hook completed")
 
-			ctx, cancel := context.WithCancel(context.Background())
-			go func() {
-				defer cancel()
-
-				<-postStartContext.StopCh
-			}()
-
-			runControllers, err := c.ExtraConfig.BuildControllersPostStartHook(ctx)
+			runControllers, err := c.ExtraConfig.BuildControllersPostStartHook(controllersCtx)
 			if err != nil {
 				return fmt.Errorf("cannot create run controller func: %w", err)
 			}
 
-			shutdown.Add(1)
+			controllersShutdownWaitGroup.Add(1)
 			go func() {
-				defer shutdown.Done()
+				// When this goroutine ends, then also end the WaitGroup, allowing anyone who called Wait() to proceed.
+				defer controllersShutdownWaitGroup.Done()
 
-				runControllers(ctx)
+				// Start the controllers and block until their context is cancelled and they have shut down.
+				runControllers(controllersCtx)
+				plog.Debug("start-controllers post start hook's background goroutine saw runControllers() finish")
 			}()
 
 			return nil
 		},
 	)
+
 	s.GenericAPIServer.AddPreShutdownHookOrDie("stop-controllers",
 		func() error {
 			plog.Debug("stop-controllers pre shutdown hook starting")
 			defer plog.Debug("stop-controllers pre shutdown hook completed")
 
-			shutdown.Wait()
+			// The generic api server is telling us that it wants to shut down, so tell our controllers that we
+			// want them to shut down by cancelling their context.
+			cancelControllerCtx()
+
+			// Now wait for the controllers to finish shutting down. By blocking here, we prevent the generic api server's
+			// graceful shutdown process from continuing until we are finished shutting down our own controllers.
+			controllersShutdownWaitGroup.Wait()
 
 			return nil
 		},
diff --git a/internal/supervisor/server/server.go b/internal/supervisor/server/server.go
index d6605b2b..333884dc 100644
--- a/internal/supervisor/server/server.go
+++ b/internal/supervisor/server/server.go
@@ -382,8 +382,10 @@ func prepareControllers(
 	return controllerinit.Prepare(controllerManager.Start, leaderElector, kubeInformers, pinnipedInformers)
 }
 
-//nolint:funlen
-func runSupervisor(ctx context.Context, podInfo *downward.PodInfo, cfg *supervisor.Config) error {
+// Boot the aggregated API server, which will in turn boot the controllers. Also open the appropriate network ports
+// and start serving the health endpoint and the endpoints of the configured FederationDomains.
+// In practice, the ctx passed in should be one which will be cancelled when the process receives SIGTERM or SIGINT.
+func runSupervisor(ctx context.Context, podInfo *downward.PodInfo, cfg *supervisor.Config) error { //nolint:funlen
 	serverInstallationNamespace := podInfo.Namespace
 	clientSecretSupervisorGroupData := groupsuffix.SupervisorAggregatedGroups(*cfg.APIGroupSuffix)
 
@@ -575,7 +577,9 @@ func runSupervisor(ctx context.Context, podInfo *downward.PodInfo, cfg *supervis
 	plog.Debug("supervisor started")
 	defer plog.Debug("supervisor exiting")
 
-	// Run the server. Its post-start hook will start the controllers.
+	// Run the server. Its post-start hook will start the controllers. Its pre shutdown hook will be called when ctx is
+	// cancelled, and that hook should gracefully stop the controllers and give up the leader election lease. See the
+	// code for these hooks in internal/supervisor/apiserver.go.
 	err = server.GenericAPIServer.PrepareRun().Run(ctx.Done())
 	if err != nil {
 		return err
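
For reference, here is a minimal, standalone sketch (not Pinniped's actual code) of the shutdown-ordering pattern the hooks above implement: a post-start step launches the controllers on their own context, and a pre-shutdown step cancels that context and then blocks on a WaitGroup until the controllers have fully stopped. The runControllers function and the log messages are illustrative stand-ins, and signal.NotifyContext only approximates genericapiserver.SetupSignalContext (it does not exit the process on a second signal).

package main

import (
	"context"
	"log"
	"os/signal"
	"sync"
	"syscall"
	"time"
)

// runControllers stands in for the function returned by BuildControllersPostStartHook:
// it does periodic work and blocks until its context is cancelled.
func runControllers(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			log.Println("controllers: context cancelled, shutting down")
			return
		case <-time.After(1 * time.Second):
			log.Println("controllers: periodic work")
		}
	}
}

func main() {
	// Roughly what genericapiserver.SetupSignalContext() provides: a context cancelled on SIGTERM/SIGINT.
	signalCtx, stop := signal.NotifyContext(context.Background(), syscall.SIGTERM, syscall.SIGINT)
	defer stop()

	// The controllers get their own context so that shutdown ordering can be controlled explicitly.
	controllersCtx, cancelControllersCtx := context.WithCancel(context.Background())
	controllersShutdownWaitGroup := &sync.WaitGroup{}

	// "Post start hook": launch the controllers in the background.
	controllersShutdownWaitGroup.Add(1)
	go func() {
		defer controllersShutdownWaitGroup.Done()
		runControllers(controllersCtx)
	}()

	// Block until the first signal arrives, as the aggregated API server would before invoking
	// its pre shutdown hooks.
	<-signalCtx.Done()

	// "Pre shutdown hook": ask the controllers to stop, then wait until they have finished.
	cancelControllersCtx()
	controllersShutdownWaitGroup.Wait()
	log.Println("controllers stopped; graceful shutdown can continue")
}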