kubecertagent: attempt to load signer as long as agent labels match

This change updates the kube cert agent to a middle-ground behavior
that balances leader election gating with how quickly we load the
signer.

If the agent labels have not changed, we will attempt to load the
signer even if we cannot roll out the latest version of the kube
cert agent deployment.

This gives us the best of both behaviors: we do not have controllers
fighting over the state of the deployment, and we still get the
signer loaded quickly.

We will have about a minute of downtime when the kube cert agent
deployment changes, because the new concierge pods will have to wait
to become the leader and for the new deployment to roll out the new
agent pods.  We would need a per-pod deployment to avoid that
downtime (but this would come at the cost of startup time and would
require coordination with the kubelet with regard to pod readiness).
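For illustration, a minimal sketch of the intended sync flow. The helpers here (ensureDeployment, findAgentPodByLabel, loadSigner, firstNonNil) are hypothetical stand-ins, not the actual Pinniped controller code:

package main

import (
	"context"
	"errors"
	"fmt"
)

// Hypothetical stand-ins for the real controller dependencies; they exist
// only so this sketch compiles and shows the ordering of operations.
var (
	ensureDeployment    = func(ctx context.Context) error { return errors.New("deployment update failed") }
	findAgentPodByLabel = func(ctx context.Context) (string, error) { return "agent-pod-1", nil }
	loadSigner          = func(ctx context.Context, pod string) error { return nil }
)

// firstNonNil mirrors the role of the firstErr helper added in this commit:
// report the earlier (deployment) error ahead of any later error.
func firstNonNil(errs ...error) error {
	for _, err := range errs {
		if err != nil {
			return err
		}
	}
	return nil
}

// syncSketch shows the middle-ground behavior: tolerate a failed deployment
// update, still try to load the signer from pods that match the agent labels,
// and only report success once the deployment has also been reconciled.
func syncSketch(ctx context.Context) error {
	depErr := ensureDeployment(ctx) // may fail; another concierge pod may own the deployment

	pod, err := findAgentPodByLabel(ctx)
	if err != nil {
		return firstNonNil(depErr, err)
	}
	if err := loadSigner(ctx, pod); err != nil {
		return firstNonNil(depErr, err)
	}

	// Signer loaded, but keep the sync failing (and retrying) until the
	// deployment itself is healthy.
	return depErr
}

func main() {
	fmt.Println(syncSketch(context.Background()))
}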

Signed-off-by: Monis Khan <mok@vmware.com>
Monis Khan 2021-09-20 12:39:31 -04:00
parent 9851035e40
commit 0d6bf9db3e
2 changed files with 79 additions and 9 deletions


@@ -46,7 +46,19 @@ const (
 	ControllerManagerNamespace = "kube-system"
 	// agentPodLabelKey is used to identify which pods are created by the kube-cert-agent
-	// controllers.
+	// controllers. These values should be updated when an incompatible change is made to
+	// the kube-cert-agent pods. Doing so will cause about a minute of downtime of the token
+	// credential request API on upgrade because the new concierge pods will not be able to
+	// fill their agentController.dynamicCertProvider cache until the new deployment has rolled
+	// out. This is exacerbated by our leader election which assumes that filling caches only
+	// requires read requests. On the incompatible kube-cert-agent upgrade case, we have to
+	// issue a write request to fill a cache (which just points out how hacky this code is).
+	//
+	// On an upgrade to a new pinniped version, if the agent label has not changed, the new
+	// pinniped code is basically saying that it is safe to use the old deployment forever
+	// (because the update to the new deployment could fail indefinitely). Therefore, if the
+	// new pinniped code wants to guarantee that any change to the kube cert agent deployment
+	// has rolled out before attempting to fetch the signer, it must update agentPodLabelValue.
 	agentPodLabelKey   = "kube-cert-agent.pinniped.dev"
 	agentPodLabelValue = "v3"
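As a standalone sketch of how a label key/value pair like this typically becomes a pod selector, assuming the standard apimachinery labels package (the lister call in the comment is illustrative, not a quote of the Pinniped wiring):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/labels"
)

func main() {
	// The same key/value pair as the constants above; bumping the value is
	// what forces the controller to wait for a fresh rollout before it will
	// trust any agent pod.
	agentLabels := labels.SelectorFromSet(labels.Set{
		"kube-cert-agent.pinniped.dev": "v3",
	})

	// A lister call such as pods.Lister().Pods(namespace).List(agentLabels)
	// would then only return pods created by a compatible agent deployment.
	fmt.Println(agentLabels.String()) // kube-cert-agent.pinniped.dev=v3
}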
@@ -268,16 +280,19 @@ func (c *agentController) Sync(ctx controllerlib.Context) error {
 		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
-	if err := c.createOrUpdateDeployment(ctx, newestControllerManager); err != nil {
-		err := fmt.Errorf("could not ensure agent deployment: %w", err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+	depErr := c.createOrUpdateDeployment(ctx, newestControllerManager)
+	if depErr != nil {
+		// it is fine if this call fails because a different concierge pod may have already created a compatible deployment
+		// thus if the later code is able to find pods with the agent labels that we expect, we will attempt to use them
+		// this means that we must always change the agent labels when we change the agent pods in an incompatible way
+		depErr = fmt.Errorf("could not ensure agent deployment: %w", depErr)
 	}
 	// Find the latest healthy agent Pod in our namespace.
 	agentPods, err := c.agentPods.Lister().Pods(c.cfg.Namespace).List(agentLabels)
 	if err != nil {
 		err := fmt.Errorf("could not list agent pods: %w", err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 	newestAgentPod := newestRunningPod(agentPods)
@@ -285,25 +300,31 @@ func (c *agentController) Sync(ctx controllerlib.Context) error {
 	// the CredentialIssuer.
 	if newestAgentPod == nil {
 		err := fmt.Errorf("could not find a healthy agent pod (%s)", pluralize(agentPods))
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
 	// Load the Kubernetes API info from the kube-public/cluster-info ConfigMap.
 	configMap, err := c.kubePublicConfigMaps.Lister().ConfigMaps(ClusterInfoNamespace).Get(clusterInfoName)
 	if err != nil {
 		err := fmt.Errorf("failed to get %s/%s configmap: %w", ClusterInfoNamespace, clusterInfoName, err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotGetClusterInfoStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotGetClusterInfoStrategyReason)
 	}
 	apiInfo, err := c.extractAPIInfo(configMap)
 	if err != nil {
 		err := fmt.Errorf("could not extract Kubernetes API endpoint info from %s/%s configmap: %w", ClusterInfoNamespace, clusterInfoName, err)
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotGetClusterInfoStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotGetClusterInfoStrategyReason)
 	}
 	// Load the certificate and key from the agent pod into our in-memory signer.
 	if err := c.loadSigningKey(newestAgentPod); err != nil {
-		return c.failStrategyAndErr(ctx.Context, credIssuer, err, configv1alpha1.CouldNotFetchKeyStrategyReason)
+		return c.failStrategyAndErr(ctx.Context, credIssuer, firstErr(depErr, err), configv1alpha1.CouldNotFetchKeyStrategyReason)
 	}
+
+	if depErr != nil {
+		// if we get here, it means that we have successfully loaded a signing key but failed to reconcile the deployment.
+		// mark the status as failed and re-kick the sync loop until we are happy with the state of the deployment.
+		return c.failStrategyAndErr(ctx.Context, credIssuer, depErr, configv1alpha1.CouldNotFetchKeyStrategyReason)
+	}
 	// Set the CredentialIssuer strategy to successful.
@@ -592,3 +613,13 @@ func pluralize(pods []*corev1.Pod) string {
 	}
 	return fmt.Sprintf("%d candidates", len(pods))
 }
+
+func firstErr(errs ...error) error {
+	for _, err := range errs {
+		err := err
+		if err != nil {
+			return err
+		}
+	}
+	return fmt.Errorf("all errors were nil but should not have been")
+}
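To make the firstErr ordering concrete, a self-contained sketch (the error messages are illustrative) showing that the deployment error, when present, is what ends up reported in the CredentialIssuer status:

package main

import (
	"errors"
	"fmt"
)

// firstErr, in the spirit of the helper added above: return the first non-nil error.
func firstErr(errs ...error) error {
	for _, err := range errs {
		if err != nil {
			return err
		}
	}
	return errors.New("all errors were nil but should not have been")
}

func main() {
	depErr := errors.New("could not ensure agent deployment: some delete error")
	listErr := errors.New("could not list agent pods: boom")

	// When both errors are set, the deployment error wins, matching the
	// "could not ensure agent deployment" message asserted in the new test case below.
	fmt.Println(firstErr(depErr, listErr)) // could not ensure agent deployment: some delete error
	fmt.Println(firstErr(nil, listErr))    // could not list agent pods: boom
}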


@@ -907,6 +907,45 @@ func TestAgentController(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "deployment exists has old selector, but agent pod exists with correct labels, configmap is valid, exec succeeds",
+			pinnipedObjects: []runtime.Object{
+				initialCredentialIssuer,
+			},
+			kubeObjects: []runtime.Object{
+				healthyKubeControllerManagerPod,
+				healthyAgentDeploymentWithOldStyleSelector,
+				healthyAgentPod,
+				validClusterInfoConfigMap,
+			},
+			addKubeReactions: func(clientset *kubefake.Clientset) {
+				clientset.PrependReactor("delete", "deployments", func(action coretesting.Action) (handled bool, ret runtime.Object, err error) {
+					return true, nil, fmt.Errorf("some delete error")
+				})
+			},
+			mocks: mockExecSucceeds, // expect an attempt to fill the cache
+			wantDistinctErrors: []string{
+				"could not ensure agent deployment: some delete error",
+			},
+			wantAgentDeployment: healthyAgentDeploymentWithOldStyleSelector, // couldn't be deleted, so it didn't change
+			// delete to try to recreate deployment when Selector field changes, but delete always fails, so keeps trying to delete
+			wantDeploymentActionVerbs: []string{"list", "watch", "delete", "delete"},
+			wantDistinctLogs: []string{
+				`kube-cert-agent-controller "level"=0 "msg"="deleting deployment to update immutable Selector field" "deployment"={"name":"pinniped-concierge-kube-cert-agent","namespace":"concierge"} "templatePod"={"name":"kube-controller-manager-1","namespace":"kube-system"}`,
+				`kube-cert-agent-controller "level"=0 "msg"="successfully loaded signing key from agent pod into cache"`,
+			},
+			wantDeploymentDeleteActionOpts: []metav1.DeleteOptions{
+				testutil.NewPreconditions(healthyAgentDeploymentWithOldStyleSelector.UID, healthyAgentDeploymentWithOldStyleSelector.ResourceVersion),
+				testutil.NewPreconditions(healthyAgentDeploymentWithOldStyleSelector.UID, healthyAgentDeploymentWithOldStyleSelector.ResourceVersion),
+			},
+			wantStrategy: &configv1alpha1.CredentialIssuerStrategy{
+				Type:           configv1alpha1.KubeClusterSigningCertificateStrategyType,
+				Status:         configv1alpha1.ErrorStrategyStatus,
+				Reason:         configv1alpha1.CouldNotFetchKeyStrategyReason,
+				Message:        "could not ensure agent deployment: some delete error",
+				LastUpdateTime: metav1.NewTime(now),
+			},
+		},
 		{
 			name: "deployment exists, configmap is valid, exec succeeds",
 			pinnipedObjects: []runtime.Object{