Add a test to verify that the kube-cert-agent recovers when a pod becomes unhealthy.

This required some small adjustments to the production code to make it more feasible to test. The new test takes an existing agent pod and terminates its `sleep` process, causing the pod to go into an `Error` status. The agent controllers _should_ respond by deleting and recreating the failed pod, but the current code just gets stuck. This replicates what happens when a cluster is suspended and resumed, which also leaves the agent pod in this terminal error state.

Signed-off-by: Matt Moyer <moyerm@vmware.com>

commit 24c8bdef44 (parent 4375c01afb)
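
For orientation before the diff: the recovery behavior the commit message describes, namely deleting an agent pod that has landed in a terminal state so that it gets recreated, is exactly what the new test asserts and what the current controllers fail to do. The client-go sketch below is purely illustrative (the package name, function name, and the namespace/label-selector parameters are placeholders, not code from this commit or from Pinniped's controllers); it just makes the expected reconciliation concrete.

package sketch

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// deleteFailedAgentPods shows the kind of reconciliation the new test expects:
// any agent pod that has reached a terminal phase (e.g. "Error" after its
// sleep process is killed) is deleted, so the creating controller can replace
// it with a healthy pod. Namespace and label selector are caller-supplied
// placeholders here.
func deleteFailedAgentPods(ctx context.Context, client kubernetes.Interface, namespace, labelSelector string) error {
	pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
	if err != nil {
		return fmt.Errorf("could not list agent pods: %w", err)
	}
	for i := range pods.Items {
		pod := &pods.Items[i]
		if pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded {
			if err := client.CoreV1().Pods(namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil {
				return fmt.Errorf("could not delete failed agent pod %s/%s: %w", namespace, pod.Name, err)
			}
		}
	}
	return nil
}

A pod whose only container has exited non-zero is shown as `Error` by kubectl, but its `status.phase` is `Failed`, which is what the sketch keys off.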
@@ -25,7 +25,7 @@ RUN mkdir out \

 # Use a runtime image based on Debian slim
 FROM debian:10.9-slim
-RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y ca-certificates procps && rm -rf /var/lib/apt/lists/*

 # Copy the binaries from the build-env stage
 COPY --from=build-env /work/out/pinniped-concierge /usr/local/bin/pinniped-concierge

@@ -129,7 +129,7 @@ func (c *AgentPodConfig) newAgentPod(controllerManagerPod *corev1.Pod) *corev1.Pod {
 				Name:            "sleeper",
 				Image:           c.ContainerImage,
 				ImagePullPolicy: corev1.PullIfNotPresent,
-				Command:         []string{"/bin/sleep", "infinity"},
+				Command:         []string{"/bin/sh", "-c", "/bin/sleep infinity"},
 				VolumeMounts:    controllerManagerPod.Spec.Containers[0].VolumeMounts,
 				Resources: corev1.ResourceRequirements{
 					Limits: corev1.ResourceList{

@@ -1,4 +1,4 @@
-// Copyright 2020 the Pinniped contributors. All Rights Reserved.
+// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0

 package kubecertagent

@@ -101,7 +101,7 @@ func exampleControllerManagerAndAgentPods(
 				Image:           "some-agent-image",
 				ImagePullPolicy: corev1.PullIfNotPresent,
 				VolumeMounts:    controllerManagerPod.Spec.Containers[0].VolumeMounts,
-				Command:         []string{"/bin/sleep", "infinity"},
+				Command:         []string{"/bin/sh", "-c", "/bin/sleep infinity"},
 				Resources: corev1.ResourceRequirements{
 					Limits: corev1.ResourceList{
 						corev1.ResourceMemory: resource.MustParse("16Mi"),

@@ -1,9 +1,10 @@
-// Copyright 2020 the Pinniped contributors. All Rights Reserved.
+// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0

 package integration

 import (
+	"bytes"
 	"context"
 	"fmt"
 	"sort"

@@ -14,8 +15,12 @@ import (
 	"github.com/stretchr/testify/require"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
+	k8serrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/diff"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/kubernetes/scheme"
+	"k8s.io/client-go/tools/remotecommand"

 	"go.pinniped.dev/test/library"
 )

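All of the imports added above (`bytes`, `k8serrors`, `kubernetes`, `scheme`, and `remotecommand`) exist to support the new "reconcile on unhealthy" subtest in the final hunk, which execs into a running agent pod and runs `/usr/bin/killall sleep`. The standalone helper below shows that exec pattern in one place; the function name and signature are illustrative assumptions rather than part of the commit, but the client-go calls mirror what the test does.

package sketch

import (
	"bytes"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/remotecommand"
)

// execInPod runs a command inside the named pod's default container and
// returns its stdout and stderr, the same way the new test runs
// "/usr/bin/killall sleep" inside an agent pod.
func execInPod(client kubernetes.Interface, restConfig *rest.Config, namespace, podName string, command []string) (string, string, error) {
	// Build the exec subresource request for the target pod.
	req := client.CoreV1().
		RESTClient().
		Post().
		Namespace(namespace).
		Resource("pods").
		Name(podName).
		SubResource("exec").
		VersionedParams(&corev1.PodExecOptions{
			Stdout:  true,
			Stderr:  true,
			Command: command,
		}, scheme.ParameterCodec)

	// Open an SPDY connection and stream the command's output back.
	executor, err := remotecommand.NewSPDYExecutor(restConfig, "POST", req.URL())
	if err != nil {
		return "", "", fmt.Errorf("could not create SPDY executor: %w", err)
	}

	var stdout, stderr bytes.Buffer
	if err := executor.Stream(remotecommand.StreamOptions{Stdout: &stdout, Stderr: &stderr}); err != nil {
		return stdout.String(), stderr.String(), fmt.Errorf("exec failed: %w", err)
	}
	return stdout.String(), stderr.String(), nil
}

In the test itself, the REST config comes from `library.NewClientConfig(t)`, the target is one of the existing kube-cert-agent pods, and the command is `[]string{"/usr/bin/killall", "sleep"}`.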
@@ -27,11 +32,15 @@ const (
 func TestKubeCertAgent(t *testing.T) {
 	env := library.IntegrationEnv(t).WithCapability(library.ClusterSigningKeyIsAvailable)

-	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
 	defer cancel()

 	kubeClient := library.NewClientset(t)

+	// Make sure the agent pods are running and healthy before the tests begin.
+	t.Logf("waiting for agent pods to become running before tests")
+	waitForAllAgentsRunning(t, kubeClient, env, ctx)
+
 	// Get the current number of kube-cert-agent pods.
 	//
 	// We can pretty safely assert there should be more than 1, since there should be a

@@ -98,26 +107,121 @@ func TestKubeCertAgent(t *testing.T) {
 			updatedAgentPod.Spec.Tolerations,
 			corev1.Toleration{Key: "fake-toleration"},
 		)
+		t.Logf("updating agent pod %s/%s with a fake toleration", updatedAgentPod.Namespace, updatedAgentPod.Name)
 		_, err = kubeClient.CoreV1().Pods(env.ConciergeNamespace).Update(ctx, updatedAgentPod, metav1.UpdateOptions{})
 		require.NoError(t, err)
+		time.Sleep(1 * time.Second)

 		// Make sure the original pods come back.
+		t.Logf("waiting for agent pods to reconcile")
 		assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond)
 		require.NoError(t, err)
+
+		// Make sure the pods all become healthy.
+		t.Logf("waiting for agent pods to become running")
+		waitForAllAgentsRunning(t, kubeClient, env, ctx)
 	})

 	t.Run("reconcile on delete", func(t *testing.T) {
 		// Delete the first pod. The controller should see it, and flip it back.
+		podToDelete := originalAgentPods.Items[0]
+		t.Logf("deleting agent pod %s/%s", podToDelete.Namespace, podToDelete.Name)
 		err = kubeClient.
 			CoreV1().
 			Pods(env.ConciergeNamespace).
-			Delete(ctx, originalAgentPods.Items[0].Name, metav1.DeleteOptions{})
+			Delete(ctx, podToDelete.Name, metav1.DeleteOptions{})
+		require.NoError(t, err)
+		time.Sleep(1 * time.Second)
+
+		// Make sure the original pods come back.
+		t.Logf("waiting for agent pods to reconcile")
+		assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond)
 		require.NoError(t, err)

+		// Make sure the pods all become healthy.
+		t.Logf("waiting for agent pods to become running")
+		waitForAllAgentsRunning(t, kubeClient, env, ctx)
+	})
+
+	t.Run("reconcile on unhealthy", func(t *testing.T) {
+		// Refresh this pod so we have its latest UID to compare against later.
+		podToDisrupt := &originalAgentPods.Items[0]
+		podToDisrupt, err = kubeClient.CoreV1().Pods(podToDisrupt.Namespace).Get(ctx, originalAgentPods.Items[0].Name, metav1.GetOptions{})
+		require.NoError(t, err)
+
+		// Exec into the pod and kill the sleep process, which should cause the pod to enter status.phase == "Error".
+		execRequest := kubeClient.
+			CoreV1().
+			RESTClient().
+			Post().
+			Namespace(podToDisrupt.Namespace).
+			Resource("pods").
+			Name(podToDisrupt.Name).
+			SubResource("exec").
+			VersionedParams(&corev1.PodExecOptions{
+				Stdout:  true,
+				Stderr:  true,
+				Command: []string{"/usr/bin/killall", "sleep"},
+			}, scheme.ParameterCodec)
+		executor, err := remotecommand.NewSPDYExecutor(library.NewClientConfig(t), "POST", execRequest.URL())
+		require.NoError(t, err)
+		t.Logf("execing into agent pod %s/%s to run '/usr/bin/killall sleep'", podToDisrupt.Namespace, podToDisrupt.Name)
+		var stdout, stderr bytes.Buffer
+		require.NoError(t, executor.Stream(remotecommand.StreamOptions{Stdout: &stdout, Stderr: &stderr}))
+		t.Logf("'/usr/bin/killall sleep' finished (stdout: %q, stderr: %q)", stdout.String(), stderr.String())
+
+		// Wait for that pod to disappear (since it will have failed).
+		t.Logf("waiting for unhealthy agent pod to disappear")
+		library.RequireEventuallyWithoutError(t, func() (bool, error) {
+			currentPod, err := kubeClient.CoreV1().Pods(podToDisrupt.Namespace).Get(ctx, podToDisrupt.Name, metav1.GetOptions{})
+			if err != nil {
+				if k8serrors.IsNotFound(err) {
+					return true, nil
+				}
+				return false, err
+			}
+			if currentPod.UID == podToDisrupt.UID {
+				t.Logf("pod %s/%s still exists in status %s", podToDisrupt.Namespace, podToDisrupt.Name, currentPod.Status.Phase)
+				return false, nil
+			}
+			return true, nil
+		}, 10*time.Second, 1*time.Second, "unhealthy agent pod was never deleted")
+
+		t.Logf("waiting for agent pods to reconcile")
 		// Make sure the original pods come back.
 		assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond)
 		require.NoError(t, err)
+
+		// Make sure the pods all become healthy.
+		t.Logf("waiting for agent pods to become running")
+		waitForAllAgentsRunning(t, kubeClient, env, ctx)
 	})
+
+}
+
+func waitForAllAgentsRunning(t *testing.T, kubeClient kubernetes.Interface, env *library.TestEnv, ctx context.Context) {
+	library.RequireEventuallyWithoutError(t, func() (bool, error) {
+		pods, err := kubeClient.CoreV1().Pods(env.ConciergeNamespace).List(ctx, metav1.ListOptions{
+			LabelSelector: kubeCertAgentLabelSelector,
+		})
+		if err != nil {
+			return false, err
+		}
+
+		if len(pods.Items) == 0 {
+			t.Logf("there are no agent pods yet")
+			return false, nil
+		}
+
+		allRunning := true
+		for _, pod := range pods.Items {
+			t.Logf("agent pod %s/%s is in status %s", pod.Namespace, pod.Name, pod.Status.Phase)
+			if pod.Status.Phase != corev1.PodRunning {
+				allRunning = false
+			}
+		}
+		return allRunning, nil
+	}, 60*time.Second, 2*time.Second, "agent pods never went back to Running status")
 }

 func sortPods(pods *corev1.PodList) {