Merge pull request #846 from enj/enj/i/faster_kube_cert

kubecertagent: attempt to load signer as long as agent labels match
Mo Khan 2021-09-21 17:03:23 -04:00 committed by GitHub
commit 3bde085c57
2 changed files with 79 additions and 9 deletions


@@ -46,7 +46,19 @@ const (
 	ControllerManagerNamespace = "kube-system"
 
 	// agentPodLabelKey is used to identify which pods are created by the kube-cert-agent
-	// controllers.
+	// controllers. These values should be updated when an incompatible change is made to
+	// the kube-cert-agent pods. Doing so will cause about a minute of downtime of the token
+	// credential request API on upgrade because the new concierge pods will not be able to
+	// fill their agentController.dynamicCertProvider cache until the new deployment has rolled
+	// out. This is exacerbated by our leader election which assumes that filling caches only
+	// requires read requests. On the incompatible kube-cert-agent upgrade case, we have to
+	// issue a write request to fill a cache (which just points out how hacky this code is).
+	//
+	// On an upgrade to a new pinniped version, if the agent label has not changed, the new
+	// pinniped code is basically saying that it is safe to use the old deployment forever
+	// (because the update to the new deployment could fail indefinitely). Therefore, if the
+	// new pinniped code wants to guarantee that any change to the kube cert agent deployment
+	// has rolled out before attempting to fetch the signer, it must update agentPodLabelValue.
 	agentPodLabelKey   = "kube-cert-agent.pinniped.dev"
 	agentPodLabelValue = "v3"
@@ -268,16 +280,19 @@ func (c *agentController) Sync(ctx controllerlib.Context) error {
 		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 
-	if err := c.createOrUpdateDeployment(ctx, newestControllerManager); err != nil {
-		err := fmt.Errorf("could not ensure agent deployment: %w", err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+	depErr := c.createOrUpdateDeployment(ctx, newestControllerManager)
+	if depErr != nil {
+		// it is fine if this call fails because a different concierge pod may have already created a compatible deployment
+		// thus if the later code is able to find pods with the agent labels that we expect, we will attempt to use them
+		// this means that we must always change the agent labels when we change the agent pods in an incompatible way
+		depErr = fmt.Errorf("could not ensure agent deployment: %w", depErr)
 	}
 
 	// Find the latest healthy agent Pod in our namespace.
 	agentPods, err := c.agentPods.Lister().Pods(c.cfg.Namespace).List(agentLabels)
 	if err != nil {
 		err := fmt.Errorf("could not list agent pods: %w", err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 
 	newestAgentPod := newestRunningPod(agentPods)
@@ -285,25 +300,31 @@ func (c *agentController) Sync(ctx controllerlib.Context) error {
 	// the CredentialIssuer.
 	if newestAgentPod == nil {
 		err := fmt.Errorf("could not find a healthy agent pod (%s)", pluralize(agentPods))
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 
 	// Load the Kubernetes API info from the kube-public/cluster-info ConfigMap.
 	configMap, err := c.kubePublicConfigMaps.Lister().ConfigMaps(ClusterInfoNamespace).Get(clusterInfoName)
 	if err != nil {
 		err := fmt.Errorf("failed to get %s/%s configmap: %w", ClusterInfoNamespace, clusterInfoName, err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotGetClusterInfoStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotGetClusterInfoStrategyReason)
 	}
 
 	apiInfo, err := c.extractAPIInfo(configMap)
 	if err != nil {
 		err := fmt.Errorf("could not extract Kubernetes API endpoint info from %s/%s configmap: %w", ClusterInfoNamespace, clusterInfoName, err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotGetClusterInfoStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotGetClusterInfoStrategyReason)
 	}
 
 	// Load the certificate and key from the agent pod into our in-memory signer.
 	if err := c.loadSigningKey(newestAgentPod); err != nil {
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 
+	if depErr != nil {
+		// if we get here, it means that we have successfully loaded a signing key but failed to reconcile the deployment.
+		// mark the status as failed and re-kick the sync loop until we are happy with the state of the deployment.
+		return c.failStrategyAndErr(ctx.Context, credIssuer, depErr, configv1alpha1.CouldNotFetchKeyStrategyReason)
+	}
+
 	// Set the CredentialIssuer strategy to successful.
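
Taken together, the two hunks above change the Sync flow so that a failed deployment reconcile no longer short-circuits the sync: the error is remembered as depErr, folded into any later failure via firstErr, and still reported even when the signer loads successfully. The following is a standalone sketch of that control flow, using hypothetical function names (ensureDeployment, loadSigner) rather than the controller's real methods.

    package main

    import (
        "errors"
        "fmt"
    )

    // reconcileSketch mirrors the flow added above: the deployment error is remembered,
    // later failures defer to it, and a successful signer load still reports the
    // deployment error so the sync loop keeps retrying until the rollout succeeds.
    func reconcileSketch(ensureDeployment func() error, loadSigner func() error) error {
        var depErr error
        if err := ensureDeployment(); err != nil {
            // tolerated for now: a compatible deployment from another concierge pod may already exist
            depErr = fmt.Errorf("could not ensure agent deployment: %w", err)
        }

        if err := loadSigner(); err != nil {
            if depErr != nil {
                return depErr // when both steps fail, surface the deployment failure first
            }
            return err
        }

        // signer loaded, but the deployment is still unreconciled, so stay in an error state
        return depErr
    }

    func main() {
        err := reconcileSketch(
            func() error { return errors.New("some delete error") },
            func() error { return nil }, // signer load succeeds from an existing agent pod
        )
        fmt.Println(err) // could not ensure agent deployment: some delete error
    }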
@@ -592,3 +613,13 @@ func pluralize(pods []*corev1.Pod) string {
 	}
 	return fmt.Sprintf("%d candidates", len(pods))
 }
+
+func firstErr(errs ...error) error {
+	for _, err := range errs {
+		err := err
+		if err != nil {
+			return err
+		}
+	}
+	return fmt.Errorf("all errors were nil but should not have been")
+}
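
Because the callers above pass depErr ahead of the local error, a deployment reconcile failure takes precedence in the reported CredentialIssuer strategy whenever both errors are non-nil. A small standalone example of that precedence follows; the helper is duplicated here only so the snippet compiles on its own.

    package main

    import (
        "errors"
        "fmt"
    )

    func firstErr(errs ...error) error {
        for _, err := range errs {
            if err != nil {
                return err
            }
        }
        return errors.New("all errors were nil but should not have been")
    }

    func main() {
        depErr := errors.New("could not ensure agent deployment: some delete error")
        listErr := errors.New("could not list agent pods: request timed out")

        fmt.Println(firstErr(depErr, listErr)) // the deployment error wins
        fmt.Println(firstErr(nil, listErr))    // falls through to the next non-nil error
    }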


@@ -907,6 +907,45 @@ func TestAgentController(t *testing.T) {
 			},
 		},
 	},
+	{
+		name: "deployment exists has old selector, but agent pod exists with correct labels, configmap is valid, exec succeeds",
+		pinnipedObjects: []runtime.Object{
+			initialCredentialIssuer,
+		},
+		kubeObjects: []runtime.Object{
+			healthyKubeControllerManagerPod,
+			healthyAgentDeploymentWithOldStyleSelector,
+			healthyAgentPod,
+			validClusterInfoConfigMap,
+		},
+		addKubeReactions: func(clientset *kubefake.Clientset) {
+			clientset.PrependReactor("delete", "deployments", func(action coretesting.Action) (handled bool, ret runtime.Object, err error) {
+				return true, nil, fmt.Errorf("some delete error")
+			})
+		},
+		mocks: mockExecSucceeds, // expect an attempt to fill the cache
+		wantDistinctErrors: []string{
+			"could not ensure agent deployment: some delete error",
+		},
+		wantAgentDeployment: healthyAgentDeploymentWithOldStyleSelector, // couldn't be deleted, so it didn't change
+		// delete to try to recreate deployment when Selector field changes, but delete always fails, so keeps trying to delete
+		wantDeploymentActionVerbs: []string{"list", "watch", "delete", "delete"},
+		wantDistinctLogs: []string{
+			`kube-cert-agent-controller "level"=0 "msg"="deleting deployment to update immutable Selector field" "deployment"={"name":"pinniped-concierge-kube-cert-agent","namespace":"concierge"} "templatePod"={"name":"kube-controller-manager-1","namespace":"kube-system"}`,
+			`kube-cert-agent-controller "level"=0 "msg"="successfully loaded signing key from agent pod into cache"`,
+		},
+		wantDeploymentDeleteActionOpts: []metav1.DeleteOptions{
+			testutil.NewPreconditions(healthyAgentDeploymentWithOldStyleSelector.UID, healthyAgentDeploymentWithOldStyleSelector.ResourceVersion),
+			testutil.NewPreconditions(healthyAgentDeploymentWithOldStyleSelector.UID, healthyAgentDeploymentWithOldStyleSelector.ResourceVersion),
+		},
+		wantStrategy: &configv1alpha1.CredentialIssuerStrategy{
+			Type:           configv1alpha1.KubeClusterSigningCertificateStrategyType,
+			Status:         configv1alpha1.ErrorStrategyStatus,
+			Reason:         configv1alpha1.CouldNotFetchKeyStrategyReason,
+			Message:        "could not ensure agent deployment: some delete error",
+			LastUpdateTime: metav1.NewTime(now),
+		},
+	},
 	{
 		name: "deployment exists, configmap is valid, exec succeeds",
 		pinnipedObjects: []runtime.Object{