From 0d6bf9db3e66f587817acf21fe2af68581038dcb Mon Sep 17 00:00:00 2001
From: Monis Khan
Date: Mon, 20 Sep 2021 12:39:31 -0400
Subject: [PATCH] kubecertagent: attempt to load signer as long as agent labels match

This change updates the kube cert agent to a middle-ground behavior that
balances leader election gating with how quickly we load the signer.

If the agent labels have not changed, we will attempt to load the signer
even if we cannot roll out the latest version of the kube cert agent
deployment. This gives us the best behavior - we do not have controllers
fighting over the state of the deployment and we still get the signer
loaded quickly.

We will have a minute of downtime when the kube cert agent deployment
changes because the new pods will have to wait to become the leader and
for the new deployment to roll out the new pods. We would need a per-pod
deployment to avoid that downtime (but this would come at the cost of
startup time and would require coordination with the kubelet with regard
to pod readiness).

Signed-off-by: Monis Khan
---
 .../controller/kubecertagent/kubecertagent.go | 49 +++++++++++++++----
 .../kubecertagent/kubecertagent_test.go       | 39 +++++++++++++++
 2 files changed, 79 insertions(+), 9 deletions(-)

diff --git a/internal/controller/kubecertagent/kubecertagent.go b/internal/controller/kubecertagent/kubecertagent.go
index e5651f65..35768309 100644
--- a/internal/controller/kubecertagent/kubecertagent.go
+++ b/internal/controller/kubecertagent/kubecertagent.go
@@ -46,7 +46,19 @@ const (
 	ControllerManagerNamespace = "kube-system"
 
 	// agentPodLabelKey is used to identify which pods are created by the kube-cert-agent
-	// controllers.
+	// controllers. These values should be updated when an incompatible change is made to
+	// the kube-cert-agent pods. Doing so will cause about a minute of downtime of the token
+	// credential request API on upgrade because the new concierge pods will not be able to
+	// fill their agentController.dynamicCertProvider cache until the new deployment has rolled
+	// out. This is exacerbated by our leader election which assumes that filling caches only
+	// requires read requests. In the incompatible kube-cert-agent upgrade case, we have to
+	// issue a write request to fill a cache (which just points out how hacky this code is).
+	//
+	// On an upgrade to a new pinniped version, if the agent label has not changed, the new
+	// pinniped code is basically saying that it is safe to use the old deployment forever
+	// (because the update to the new deployment could fail indefinitely). Therefore, if the
+	// new pinniped code wants to guarantee that any change to the kube cert agent deployment
+	// has rolled out before attempting to fetch the signer, it must update agentPodLabelValue.
 	agentPodLabelKey   = "kube-cert-agent.pinniped.dev"
 	agentPodLabelValue = "v3"
 
@@ -268,16 +280,19 @@ func (c *agentController) Sync(ctx controllerlib.Context) error {
 		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 
-	if err := c.createOrUpdateDeployment(ctx, newestControllerManager); err != nil {
-		err := fmt.Errorf("could not ensure agent deployment: %w", err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+	depErr := c.createOrUpdateDeployment(ctx, newestControllerManager)
+	if depErr != nil {
+		// it is fine if this call fails because a different concierge pod may have already created a compatible deployment
+		// thus if the later code is able to find pods with the agent labels that we expect, we will attempt to use them
+		// this means that we must always change the agent labels when we change the agent pods in an incompatible way
+		depErr = fmt.Errorf("could not ensure agent deployment: %w", depErr)
 	}
 
 	// Find the latest healthy agent Pod in our namespace.
 	agentPods, err := c.agentPods.Lister().Pods(c.cfg.Namespace).List(agentLabels)
 	if err != nil {
 		err := fmt.Errorf("could not list agent pods: %w", err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 	newestAgentPod := newestRunningPod(agentPods)
 
@@ -285,25 +300,31 @@ func (c *agentController) Sync(ctx controllerlib.Context) error {
 	// the CredentialIssuer.
 	if newestAgentPod == nil {
 		err := fmt.Errorf("could not find a healthy agent pod (%s)", pluralize(agentPods))
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 
 	// Load the Kubernetes API info from the kube-public/cluster-info ConfigMap.
 	configMap, err := c.kubePublicConfigMaps.Lister().ConfigMaps(ClusterInfoNamespace).Get(clusterInfoName)
 	if err != nil {
 		err := fmt.Errorf("failed to get %s/%s configmap: %w", ClusterInfoNamespace, clusterInfoName, err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotGetClusterInfoStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotGetClusterInfoStrategyReason)
 	}
 
 	apiInfo, err := c.extractAPIInfo(configMap)
 	if err != nil {
 		err := fmt.Errorf("could not extract Kubernetes API endpoint info from %s/%s configmap: %w", ClusterInfoNamespace, clusterInfoName, err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotGetClusterInfoStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotGetClusterInfoStrategyReason)
 	}
 
 	// Load the certificate and key from the agent pod into our in-memory signer.
 	if err := c.loadSigningKey(newestAgentPod); err != nil {
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotFetchKeyStrategyReason)
+	}
+
+	if depErr != nil {
+		// if we get here, it means that we have successfully loaded a signing key but failed to reconcile the deployment.
+		// mark the status as failed and re-kick the sync loop until we are happy with the state of the deployment.
+		return c.failStrategyAndErr(ctx.Context, credIssuer, depErr, configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 
 	// Set the CredentialIssuer strategy to successful.
@@ -592,3 +613,13 @@ func pluralize(pods []*corev1.Pod) string {
 	}
 	return fmt.Sprintf("%d candidates", len(pods))
 }
+
+func firstErr(errs ...error) error {
+	for _, err := range errs {
+		err := err
+		if err != nil {
+			return err
+		}
+	}
+	return fmt.Errorf("all errors were nil but should not have been")
+}
diff --git a/internal/controller/kubecertagent/kubecertagent_test.go b/internal/controller/kubecertagent/kubecertagent_test.go
index 7c77d541..9bead468 100644
--- a/internal/controller/kubecertagent/kubecertagent_test.go
+++ b/internal/controller/kubecertagent/kubecertagent_test.go
@@ -907,6 +907,45 @@ func TestAgentController(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "deployment exists has old selector, but agent pod exists with correct labels, configmap is valid, exec succeeds",
+			pinnipedObjects: []runtime.Object{
+				initialCredentialIssuer,
+			},
+			kubeObjects: []runtime.Object{
+				healthyKubeControllerManagerPod,
+				healthyAgentDeploymentWithOldStyleSelector,
+				healthyAgentPod,
+				validClusterInfoConfigMap,
+			},
+			addKubeReactions: func(clientset *kubefake.Clientset) {
+				clientset.PrependReactor("delete", "deployments", func(action coretesting.Action) (handled bool, ret runtime.Object, err error) {
+					return true, nil, fmt.Errorf("some delete error")
+				})
+			},
+			mocks: mockExecSucceeds, // expect an attempt to fill the cache
+			wantDistinctErrors: []string{
+				"could not ensure agent deployment: some delete error",
+			},
+			wantAgentDeployment: healthyAgentDeploymentWithOldStyleSelector, // couldn't be deleted, so it didn't change
+			// delete to try to recreate deployment when Selector field changes, but delete always fails, so keeps trying to delete
+			wantDeploymentActionVerbs: []string{"list", "watch", "delete", "delete"},
+			wantDistinctLogs: []string{
+				`kube-cert-agent-controller "level"=0 "msg"="deleting deployment to update immutable Selector field" "deployment"={"name":"pinniped-concierge-kube-cert-agent","namespace":"concierge"} "templatePod"={"name":"kube-controller-manager-1","namespace":"kube-system"}`,
+				`kube-cert-agent-controller "level"=0 "msg"="successfully loaded signing key from agent pod into cache"`,
+			},
+			wantDeploymentDeleteActionOpts: []metav1.DeleteOptions{
+				testutil.NewPreconditions(healthyAgentDeploymentWithOldStyleSelector.UID, healthyAgentDeploymentWithOldStyleSelector.ResourceVersion),
+				testutil.NewPreconditions(healthyAgentDeploymentWithOldStyleSelector.UID, healthyAgentDeploymentWithOldStyleSelector.ResourceVersion),
+			},
+			wantStrategy: &configv1alpha1.CredentialIssuerStrategy{
+				Type:           configv1alpha1.KubeClusterSigningCertificateStrategyType,
+				Status:         configv1alpha1.ErrorStrategyStatus,
+				Reason:         configv1alpha1.CouldNotFetchKeyStrategyReason,
+				Message:        "could not ensure agent deployment: some delete error",
+				LastUpdateTime: metav1.NewTime(now),
+			},
+		},
 		{
 			name: "deployment exists, configmap is valid, exec succeeds",
 			pinnipedObjects: []runtime.Object{