From 23cd53faeb808e22f1a427a2da694a0c441e4a70 Mon Sep 17 00:00:00 2001 From: Matt Moyer Date: Wed, 21 Apr 2021 17:00:20 -0500 Subject: [PATCH] In kube-cert-agent deleter controller, clean up pods that are stuck in terminal states. This change adjusts the kube-cert-agent "deleter" controller to also delete pods that are unusable because they are no longer "Running". This should make the Concierge recover from scenarios where clusters are suspended and resumed, as well as other edge cases where the `sleep` process in the agent pod exits for some reason. Signed-off-by: Matt Moyer --- internal/controller/kubecertagent/deleter.go | 27 +++++++- .../controller/kubecertagent/deleter_test.go | 64 ++++++++++++++++++- 2 files changed, 88 insertions(+), 3 deletions(-) diff --git a/internal/controller/kubecertagent/deleter.go b/internal/controller/kubecertagent/deleter.go index dfb66aed..9dc2f6c1 100644 --- a/internal/controller/kubecertagent/deleter.go +++ b/internal/controller/kubecertagent/deleter.go @@ -1,11 +1,13 @@ -// Copyright 2020 the Pinniped contributors. All Rights Reserved. +// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 package kubecertagent import ( "fmt" + "time" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" corev1informers "k8s.io/client-go/informers/core/v1" "k8s.io/client-go/kubernetes" @@ -70,7 +72,7 @@ func (c *deleterController) Sync(ctx controllerlib.Context) error { if err != nil { return err } - if controllerManagerPod == nil || + if controllerManagerPod == nil || inTerminalState(agentPod) || !isAgentPodUpToDate(agentPod, c.agentPodConfig.newAgentPod(controllerManagerPod)) { plog.Debug("deleting agent pod", "pod", klog.KObj(agentPod)) err := c.k8sClient. @@ -85,3 +87,24 @@ func (c *deleterController) Sync(ctx controllerlib.Context) error { return nil } + +func inTerminalState(pod *corev1.Pod) bool { + switch pod.Status.Phase { + + // Running and Pending are non-terminal states. We should not delete pods in these states. + case corev1.PodRunning, corev1.PodPending: + return false + + // Succeeded and Failed are terminal states. If a pod has entered one of these states, we want to delete it so + // it can be recreated by the other controllers. + case corev1.PodSucceeded, corev1.PodFailed: + return true + + // In other cases, we want to delete the pod but more carefully. We only consider the pod "terminal" if it is in + // this state more than 5 minutes after creation. + case corev1.PodUnknown: + fallthrough + default: + return time.Since(pod.CreationTimestamp.Time) > 5*time.Minute + } +} diff --git a/internal/controller/kubecertagent/deleter_test.go b/internal/controller/kubecertagent/deleter_test.go index 2a8b5721..d71bd4e8 100644 --- a/internal/controller/kubecertagent/deleter_test.go +++ b/internal/controller/kubecertagent/deleter_test.go @@ -1,4 +1,4 @@ -// Copyright 2020 the Pinniped contributors. All Rights Reserved. +// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 package kubecertagent @@ -12,6 +12,7 @@ import ( "github.com/sclevine/spec/report" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" kubeinformers "k8s.io/client-go/informers" corev1informers "k8s.io/client-go/informers/core/v1" @@ -122,6 +123,7 @@ func TestDeleterControllerSync(t *testing.T) { controllerManagerPod, agentPod = exampleControllerManagerAndAgentPods( kubeSystemNamespace, agentPodNamespace, "ignored for this test", "ignored for this test", ) + agentPod.Status.Phase = corev1.PodRunning podsGVR = schema.GroupVersionResource{ Group: corev1.SchemeGroupVersion.Group, @@ -494,6 +496,66 @@ func TestDeleterControllerSync(t *testing.T) { }) }) + when("there is an unhealthy agent pod", func() { + it.Before(func() { + // The matching controller-manager pod exists. + r.NoError(kubeSystemInformerClient.Tracker().Add(controllerManagerPod)) + r.NoError(kubeAPIClient.Tracker().Add(controllerManagerPod)) + + }) + + when("in a Failed state", func() { + it.Before(func() { + // The pod is in a "Failed" state, even though it otherwise matches. + agentPod.Status.Phase = corev1.PodFailed + r.NoError(agentInformerClient.Tracker().Add(agentPod)) + r.NoError(kubeAPIClient.Tracker().Add(agentPod)) + }) + + it("deletes the agent pod", func() { + startInformersAndController() + err := controllerlib.TestSync(t, subject, *syncContext) + + r.NoError(err) + requireAgentPodWasDeleted() + }) + }) + + when("in an Unknown state but recent", func() { + it.Before(func() { + agentPod.Status.Phase = corev1.PodUnknown + agentPod.CreationTimestamp = metav1.Now() + r.NoError(agentInformerClient.Tracker().Add(agentPod)) + r.NoError(kubeAPIClient.Tracker().Add(agentPod)) + }) + + it("does nothing", func() { + startInformersAndController() + err := controllerlib.TestSync(t, subject, *syncContext) + + r.NoError(err) + r.Empty(kubeAPIClient.Actions()) + }) + }) + + when("in an Unknown state and older", func() { + it.Before(func() { + agentPod.Status.Phase = corev1.PodUnknown + agentPod.CreationTimestamp = metav1.NewTime(time.Now().Add(-1 * time.Hour)) + r.NoError(agentInformerClient.Tracker().Add(agentPod)) + r.NoError(kubeAPIClient.Tracker().Add(agentPod)) + }) + + it("deletes the agent pod", func() { + startInformersAndController() + err := controllerlib.TestSync(t, subject, *syncContext) + + r.NoError(err) + requireAgentPodWasDeleted() + }) + }) + }) + when("there is no agent pod", func() { it("does nothing", func() { startInformersAndController()