In kube-cert-agent deleter controller, clean up pods that are stuck in terminal states.

This change adjusts the kube-cert-agent "deleter" controller to also delete pods that are unusable because they are no longer "Running".

This should make the Concierge recover from scenarios where clusters are suspended and resumed, as well as other edge cases where the `sleep` process in the agent pod exits for some reason.

Signed-off-by: Matt Moyer <moyerm@vmware.com>
This commit is contained in:
Matt Moyer 2021-04-21 17:00:20 -05:00
parent 24c8bdef44
commit 23cd53faeb
No known key found for this signature in database
GPG Key ID: EAE88AD172C5AE2D
2 changed files with 88 additions and 3 deletions

View File

@ -1,11 +1,13 @@
// Copyright 2020 the Pinniped contributors. All Rights Reserved. // Copyright 2020-2021 the Pinniped contributors. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
package kubecertagent package kubecertagent
import ( import (
"fmt" "fmt"
"time"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
corev1informers "k8s.io/client-go/informers/core/v1" corev1informers "k8s.io/client-go/informers/core/v1"
"k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes"
@ -70,7 +72,7 @@ func (c *deleterController) Sync(ctx controllerlib.Context) error {
if err != nil { if err != nil {
return err return err
} }
if controllerManagerPod == nil || if controllerManagerPod == nil || inTerminalState(agentPod) ||
!isAgentPodUpToDate(agentPod, c.agentPodConfig.newAgentPod(controllerManagerPod)) { !isAgentPodUpToDate(agentPod, c.agentPodConfig.newAgentPod(controllerManagerPod)) {
plog.Debug("deleting agent pod", "pod", klog.KObj(agentPod)) plog.Debug("deleting agent pod", "pod", klog.KObj(agentPod))
err := c.k8sClient. err := c.k8sClient.
@ -85,3 +87,24 @@ func (c *deleterController) Sync(ctx controllerlib.Context) error {
return nil return nil
} }
// inTerminalState reports whether the given agent pod should be treated as unusable based on its
// status phase. The deleter controller's Sync uses the result as one of the conditions for deleting
// the agent pod.
//
// The decision is made as follows:
//   - Running or Pending: never terminal, returns false.
//   - Succeeded or Failed: always terminal, returns true.
//   - Anything else (Unknown, empty, or an unrecognized phase): terminal only when the pod was
//     created more than 5 minutes ago.
func inTerminalState(pod *corev1.Pod) bool {
	phase := pod.Status.Phase

	// A pod that is running, or still on its way to running, is healthy enough to be left alone.
	if phase == corev1.PodRunning || phase == corev1.PodPending {
		return false
	}

	// A pod whose containers have all exited will not come back by itself. Report it as terminal
	// so that it gets deleted and the other controllers can recreate it.
	if phase == corev1.PodSucceeded || phase == corev1.PodFailed {
		return true
	}

	// Every remaining phase, including Unknown, is handled more cautiously: the pod is given a
	// grace period after creation before it is considered terminal.
	age := time.Since(pod.CreationTimestamp.Time)
	return age > 5*time.Minute
}

View File

@ -1,4 +1,4 @@
// Copyright 2020 the Pinniped contributors. All Rights Reserved. // Copyright 2020-2021 the Pinniped contributors. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
package kubecertagent package kubecertagent
@ -12,6 +12,7 @@ import (
"github.com/sclevine/spec/report" "github.com/sclevine/spec/report"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/runtime/schema"
kubeinformers "k8s.io/client-go/informers" kubeinformers "k8s.io/client-go/informers"
corev1informers "k8s.io/client-go/informers/core/v1" corev1informers "k8s.io/client-go/informers/core/v1"
@ -122,6 +123,7 @@ func TestDeleterControllerSync(t *testing.T) {
controllerManagerPod, agentPod = exampleControllerManagerAndAgentPods( controllerManagerPod, agentPod = exampleControllerManagerAndAgentPods(
kubeSystemNamespace, agentPodNamespace, "ignored for this test", "ignored for this test", kubeSystemNamespace, agentPodNamespace, "ignored for this test", "ignored for this test",
) )
agentPod.Status.Phase = corev1.PodRunning
podsGVR = schema.GroupVersionResource{ podsGVR = schema.GroupVersionResource{
Group: corev1.SchemeGroupVersion.Group, Group: corev1.SchemeGroupVersion.Group,
@ -494,6 +496,66 @@ func TestDeleterControllerSync(t *testing.T) {
}) })
}) })
when("there is an unhealthy agent pod", func() {
it.Before(func() {
// The matching controller-manager pod exists.
r.NoError(kubeSystemInformerClient.Tracker().Add(controllerManagerPod))
r.NoError(kubeAPIClient.Tracker().Add(controllerManagerPod))
})
when("in a Failed state", func() {
it.Before(func() {
// The pod is in a "Failed" state, even though it otherwise matches.
agentPod.Status.Phase = corev1.PodFailed
r.NoError(agentInformerClient.Tracker().Add(agentPod))
r.NoError(kubeAPIClient.Tracker().Add(agentPod))
})
it("deletes the agent pod", func() {
startInformersAndController()
err := controllerlib.TestSync(t, subject, *syncContext)
r.NoError(err)
requireAgentPodWasDeleted()
})
})
when("in an Unknown state but recent", func() {
it.Before(func() {
agentPod.Status.Phase = corev1.PodUnknown
agentPod.CreationTimestamp = metav1.Now()
r.NoError(agentInformerClient.Tracker().Add(agentPod))
r.NoError(kubeAPIClient.Tracker().Add(agentPod))
})
it("does nothing", func() {
startInformersAndController()
err := controllerlib.TestSync(t, subject, *syncContext)
r.NoError(err)
r.Empty(kubeAPIClient.Actions())
})
})
when("in an Unknown state and older", func() {
it.Before(func() {
agentPod.Status.Phase = corev1.PodUnknown
agentPod.CreationTimestamp = metav1.NewTime(time.Now().Add(-1 * time.Hour))
r.NoError(agentInformerClient.Tracker().Add(agentPod))
r.NoError(kubeAPIClient.Tracker().Add(agentPod))
})
it("deletes the agent pod", func() {
startInformersAndController()
err := controllerlib.TestSync(t, subject, *syncContext)
r.NoError(err)
requireAgentPodWasDeleted()
})
})
})
when("there is no agent pod", func() { when("there is no agent pod", func() {
it("does nothing", func() { it("does nothing", func() {
startInformersAndController() startInformersAndController()