diff --git a/internal/controller/kubecertagent/kubecertagent.go b/internal/controller/kubecertagent/kubecertagent.go
index e5651f65..35768309 100644
--- a/internal/controller/kubecertagent/kubecertagent.go
+++ b/internal/controller/kubecertagent/kubecertagent.go
@@ -46,7 +46,19 @@ const (
 	ControllerManagerNamespace = "kube-system"
 
 	// agentPodLabelKey is used to identify which pods are created by the kube-cert-agent
-	// controllers.
+	// controllers. These values should be updated when an incompatible change is made to
+	// the kube-cert-agent pods. Doing so will cause about a minute of downtime of the token
+	// credential request API on upgrade because the new concierge pods will not be able to
+	// fill their agentController.dynamicCertProvider cache until the new deployment has rolled
+	// out. This is exacerbated by our leader election which assumes that filling caches only
+	// requires read requests. On the incompatible kube-cert-agent upgrade case, we have to
+	// issue a write request to fill a cache (which just points out how hacky this code is).
+	//
+	// On an upgrade to a new pinniped version, if the agent label has not changed, the new
+	// pinniped code is basically saying that it is safe to use the old deployment forever
+	// (because the update to the new deployment could fail indefinitely). Therefore, if the
+	// new pinniped code wants to guarantee that any change to the kube cert agent deployment
+	// has rolled out before attempting to fetch the signer, it must update agentPodLabelValue.
 	agentPodLabelKey   = "kube-cert-agent.pinniped.dev"
 	agentPodLabelValue = "v3"
 
@@ -268,16 +280,19 @@ func (c *agentController) Sync(ctx controllerlib.Context) error {
 		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 
-	if err := c.createOrUpdateDeployment(ctx, newestControllerManager); err != nil {
-		err := fmt.Errorf("could not ensure agent deployment: %w", err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+	depErr := c.createOrUpdateDeployment(ctx, newestControllerManager)
+	if depErr != nil {
+		// it is fine if this call fails because a different concierge pod may have already created a compatible deployment
+		// thus if the later code is able to find pods with the agent labels that we expect, we will attempt to use them
+		// this means that we must always change the agent labels when we change the agent pods in an incompatible way
+		depErr = fmt.Errorf("could not ensure agent deployment: %w", depErr)
 	}
 
 	// Find the latest healthy agent Pod in our namespace.
 	agentPods, err := c.agentPods.Lister().Pods(c.cfg.Namespace).List(agentLabels)
 	if err != nil {
 		err := fmt.Errorf("could not list agent pods: %w", err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 
 	newestAgentPod := newestRunningPod(agentPods)
@@ -285,25 +300,31 @@ func (c *agentController) Sync(ctx controllerlib.Context) error {
 	// the CredentialIssuer.
 	if newestAgentPod == nil {
 		err := fmt.Errorf("could not find a healthy agent pod (%s)", pluralize(agentPods))
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 
 	// Load the Kubernetes API info from the kube-public/cluster-info ConfigMap.
 	configMap, err := c.kubePublicConfigMaps.Lister().ConfigMaps(ClusterInfoNamespace).Get(clusterInfoName)
 	if err != nil {
 		err := fmt.Errorf("failed to get %s/%s configmap: %w", ClusterInfoNamespace, clusterInfoName, err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotGetClusterInfoStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotGetClusterInfoStrategyReason)
 	}
 
 	apiInfo, err := c.extractAPIInfo(configMap)
 	if err != nil {
 		err := fmt.Errorf("could not extract Kubernetes API endpoint info from %s/%s configmap: %w", ClusterInfoNamespace, clusterInfoName, err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotGetClusterInfoStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotGetClusterInfoStrategyReason)
 	}
 
 	// Load the certificate and key from the agent pod into our in-memory signer.
 	if err := c.loadSigningKey(newestAgentPod); err != nil {
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotFetchKeyStrategyReason)
+	}
+
+	if depErr != nil {
+		// if we get here, it means that we have successfully loaded a signing key but failed to reconcile the deployment.
+		// mark the status as failed and re-kick the sync loop until we are happy with the state of the deployment.
+		return c.failStrategyAndErr(ctx.Context, credIssuer, depErr, configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 
 	// Set the CredentialIssuer strategy to successful.
@@ -592,3 +613,13 @@ func pluralize(pods []*corev1.Pod) string {
 	}
 	return fmt.Sprintf("%d candidates", len(pods))
 }
+
+func firstErr(errs ...error) error {
+	for _, err := range errs {
+		err := err
+		if err != nil {
+			return err
+		}
+	}
+	return fmt.Errorf("all errors were nil but should not have been")
+}
diff --git a/internal/controller/kubecertagent/kubecertagent_test.go b/internal/controller/kubecertagent/kubecertagent_test.go
index 7c77d541..9bead468 100644
--- a/internal/controller/kubecertagent/kubecertagent_test.go
+++ b/internal/controller/kubecertagent/kubecertagent_test.go
@@ -907,6 +907,45 @@ func TestAgentController(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "deployment exists has old selector, but agent pod exists with correct labels, configmap is valid, exec succeeds",
+			pinnipedObjects: []runtime.Object{
+				initialCredentialIssuer,
+			},
+			kubeObjects: []runtime.Object{
+				healthyKubeControllerManagerPod,
+				healthyAgentDeploymentWithOldStyleSelector,
+				healthyAgentPod,
+				validClusterInfoConfigMap,
+			},
+			addKubeReactions: func(clientset *kubefake.Clientset) {
+				clientset.PrependReactor("delete", "deployments", func(action coretesting.Action) (handled bool, ret runtime.Object, err error) {
+					return true, nil, fmt.Errorf("some delete error")
+				})
+			},
+			mocks: mockExecSucceeds, // expect an attempt to fill the cache
+			wantDistinctErrors: []string{
+				"could not ensure agent deployment: some delete error",
+			},
+			wantAgentDeployment: healthyAgentDeploymentWithOldStyleSelector, // couldn't be deleted, so it didn't change
+			// delete to try to recreate deployment when Selector field changes, but delete always fails, so keeps trying to delete
+			wantDeploymentActionVerbs: []string{"list", "watch", "delete", "delete"},
+			wantDistinctLogs: []string{
+				`kube-cert-agent-controller "level"=0 "msg"="deleting deployment to update immutable Selector field" "deployment"={"name":"pinniped-concierge-kube-cert-agent","namespace":"concierge"} "templatePod"={"name":"kube-controller-manager-1","namespace":"kube-system"}`,
+				`kube-cert-agent-controller "level"=0 "msg"="successfully loaded signing key from agent pod into cache"`,
+			},
+			wantDeploymentDeleteActionOpts: []metav1.DeleteOptions{
+				testutil.NewPreconditions(healthyAgentDeploymentWithOldStyleSelector.UID, healthyAgentDeploymentWithOldStyleSelector.ResourceVersion),
+				testutil.NewPreconditions(healthyAgentDeploymentWithOldStyleSelector.UID, healthyAgentDeploymentWithOldStyleSelector.ResourceVersion),
+			},
+			wantStrategy: &configv1alpha1.CredentialIssuerStrategy{
+				Type:           configv1alpha1.KubeClusterSigningCertificateStrategyType,
+				Status:         configv1alpha1.ErrorStrategyStatus,
+				Reason:         configv1alpha1.CouldNotFetchKeyStrategyReason,
+				Message:        "could not ensure agent deployment: some delete error",
+				LastUpdateTime: metav1.NewTime(now),
+			},
+		},
 		{
 			name: "deployment exists, configmap is valid, exec succeeds",
 			pinnipedObjects: []runtime.Object{
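
Below is a minimal, self-contained sketch of the error-precedence pattern the Sync changes above introduce: remember a deployment failure, keep going with any agent pods that already exist, and let the deployment error take precedence when reporting status. The syncOnce, ensureDeployment, and loadSigningKey names are illustrative stand-ins, not the controller's real API; only firstErr mirrors the helper added in the diff.

package main

import (
	"errors"
	"fmt"
)

// firstErr mirrors the helper added in the diff: it returns the first non-nil
// error so that an earlier deployment failure wins when reporting status.
func firstErr(errs ...error) error {
	for _, err := range errs {
		if err != nil {
			return err
		}
	}
	return fmt.Errorf("all errors were nil but should not have been")
}

// syncOnce is a hypothetical stand-in for one pass of the controller's Sync method.
func syncOnce(ensureDeployment, loadSigningKey func() error) error {
	// Remember, but do not immediately return, a deployment failure. Another
	// concierge pod may already have created a compatible deployment whose
	// agent pods can still be used.
	var depErr error
	if err := ensureDeployment(); err != nil {
		depErr = fmt.Errorf("could not ensure agent deployment: %w", err)
	}

	// If a later step also fails, report the earlier deployment error first.
	if err := loadSigningKey(); err != nil {
		return firstErr(depErr, err)
	}

	// Even when the signing key loads successfully, surface the deployment
	// error so the sync loop keeps retrying until the deployment is reconciled.
	return depErr
}

func main() {
	err := syncOnce(
		func() error { return errors.New("some delete error") }, // deployment reconcile fails
		func() error { return nil },                             // signing key still loads
	)
	fmt.Println(err) // could not ensure agent deployment: some delete error
}

This is the same outcome the new test case asserts: the signing key is loaded from the existing agent pod, yet the CredentialIssuer strategy is marked failed with the deployment error until the deployment can actually be reconciled.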