In kube-cert-agent deleter controller, clean up pods that are stuck in terminal states.
This change adjusts the kube-cert-agent "deleter" controller to also delete pods that are unusable because they are no longer "Running". This should make the Concierge recover from scenarios where clusters are suspended and resumed, as well as other edge cases where the `sleep` process in the agent pod exits for some reason. Signed-off-by: Matt Moyer <moyerm@vmware.com>
This commit is contained in:
parent
24c8bdef44
commit
23cd53faeb
@ -1,11 +1,13 @@
|
||||
// Copyright 2020 the Pinniped contributors. All Rights Reserved.
|
||||
// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
package kubecertagent
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
corev1informers "k8s.io/client-go/informers/core/v1"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
@ -70,7 +72,7 @@ func (c *deleterController) Sync(ctx controllerlib.Context) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if controllerManagerPod == nil ||
|
||||
if controllerManagerPod == nil || inTerminalState(agentPod) ||
|
||||
!isAgentPodUpToDate(agentPod, c.agentPodConfig.newAgentPod(controllerManagerPod)) {
|
||||
plog.Debug("deleting agent pod", "pod", klog.KObj(agentPod))
|
||||
err := c.k8sClient.
|
||||
@ -85,3 +87,24 @@ func (c *deleterController) Sync(ctx controllerlib.Context) error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func inTerminalState(pod *corev1.Pod) bool {
|
||||
switch pod.Status.Phase {
|
||||
|
||||
// Running and Pending are non-terminal states. We should not delete pods in these states.
|
||||
case corev1.PodRunning, corev1.PodPending:
|
||||
return false
|
||||
|
||||
// Succeeded and Failed are terminal states. If a pod has entered one of these states, we want to delete it so
|
||||
// it can be recreated by the other controllers.
|
||||
case corev1.PodSucceeded, corev1.PodFailed:
|
||||
return true
|
||||
|
||||
// In other cases, we want to delete the pod but more carefully. We only consider the pod "terminal" if it is in
|
||||
// this state more than 5 minutes after creation.
|
||||
case corev1.PodUnknown:
|
||||
fallthrough
|
||||
default:
|
||||
return time.Since(pod.CreationTimestamp.Time) > 5*time.Minute
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright 2020 the Pinniped contributors. All Rights Reserved.
|
||||
// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
package kubecertagent
|
||||
@ -12,6 +12,7 @@ import (
|
||||
"github.com/sclevine/spec/report"
|
||||
"github.com/stretchr/testify/require"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime/schema"
|
||||
kubeinformers "k8s.io/client-go/informers"
|
||||
corev1informers "k8s.io/client-go/informers/core/v1"
|
||||
@ -122,6 +123,7 @@ func TestDeleterControllerSync(t *testing.T) {
|
||||
controllerManagerPod, agentPod = exampleControllerManagerAndAgentPods(
|
||||
kubeSystemNamespace, agentPodNamespace, "ignored for this test", "ignored for this test",
|
||||
)
|
||||
agentPod.Status.Phase = corev1.PodRunning
|
||||
|
||||
podsGVR = schema.GroupVersionResource{
|
||||
Group: corev1.SchemeGroupVersion.Group,
|
||||
@ -494,6 +496,66 @@ func TestDeleterControllerSync(t *testing.T) {
|
||||
})
|
||||
})
|
||||
|
||||
when("there is an unhealthy agent pod", func() {
|
||||
it.Before(func() {
|
||||
// The matching controller-manager pod exists.
|
||||
r.NoError(kubeSystemInformerClient.Tracker().Add(controllerManagerPod))
|
||||
r.NoError(kubeAPIClient.Tracker().Add(controllerManagerPod))
|
||||
|
||||
})
|
||||
|
||||
when("in a Failed state", func() {
|
||||
it.Before(func() {
|
||||
// The pod is in a "Failed" state, even though it otherwise matches.
|
||||
agentPod.Status.Phase = corev1.PodFailed
|
||||
r.NoError(agentInformerClient.Tracker().Add(agentPod))
|
||||
r.NoError(kubeAPIClient.Tracker().Add(agentPod))
|
||||
})
|
||||
|
||||
it("deletes the agent pod", func() {
|
||||
startInformersAndController()
|
||||
err := controllerlib.TestSync(t, subject, *syncContext)
|
||||
|
||||
r.NoError(err)
|
||||
requireAgentPodWasDeleted()
|
||||
})
|
||||
})
|
||||
|
||||
when("in an Unknown state but recent", func() {
|
||||
it.Before(func() {
|
||||
agentPod.Status.Phase = corev1.PodUnknown
|
||||
agentPod.CreationTimestamp = metav1.Now()
|
||||
r.NoError(agentInformerClient.Tracker().Add(agentPod))
|
||||
r.NoError(kubeAPIClient.Tracker().Add(agentPod))
|
||||
})
|
||||
|
||||
it("does nothing", func() {
|
||||
startInformersAndController()
|
||||
err := controllerlib.TestSync(t, subject, *syncContext)
|
||||
|
||||
r.NoError(err)
|
||||
r.Empty(kubeAPIClient.Actions())
|
||||
})
|
||||
})
|
||||
|
||||
when("in an Unknown state and older", func() {
|
||||
it.Before(func() {
|
||||
agentPod.Status.Phase = corev1.PodUnknown
|
||||
agentPod.CreationTimestamp = metav1.NewTime(time.Now().Add(-1 * time.Hour))
|
||||
r.NoError(agentInformerClient.Tracker().Add(agentPod))
|
||||
r.NoError(kubeAPIClient.Tracker().Add(agentPod))
|
||||
})
|
||||
|
||||
it("deletes the agent pod", func() {
|
||||
startInformersAndController()
|
||||
err := controllerlib.TestSync(t, subject, *syncContext)
|
||||
|
||||
r.NoError(err)
|
||||
requireAgentPodWasDeleted()
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
when("there is no agent pod", func() {
|
||||
it("does nothing", func() {
|
||||
startInformersAndController()
|
||||
|
Loading…
Reference in New Issue
Block a user