Add a test to verify that the kube-cert-agent recovers when a pod becomes unhealthy.
This required some small adjustments to the production code to make it more feasible to test. The new test takes an existing agent pod and terminates its `sleep` process, causing the pod to go into an `Error` status. The agent controllers _should_ respond by deleting and recreating the failed pod, but the current code just gets stuck. This replicates the situation when a cluster is suspended and resumed, which also leaves the agent pod in this terminal error state.

Signed-off-by: Matt Moyer <moyerm@vmware.com>
parent 4375c01afb
commit 24c8bdef44
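For context before the diff: the behavior the new test demands is that the agent controllers notice an agent pod in a terminal phase and delete it so that it gets recreated. Below is a minimal sketch of that recovery step, assuming a client-go clientset and a label selector for the agent pods; the function name and parameters are illustrative, not code from this commit.

```go
package kubecertagent

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// deleteTerminalAgentPods is a hypothetical helper showing the expected
// recovery behavior: find agent pods stuck in a terminal phase (the "Error"
// status left behind by a cluster suspend/resume surfaces as PodFailed) and
// delete them so the controllers can recreate healthy replacements.
func deleteTerminalAgentPods(ctx context.Context, client kubernetes.Interface, namespace, labelSelector string) error {
	pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
	if err != nil {
		return fmt.Errorf("could not list agent pods: %w", err)
	}
	for i := range pods.Items {
		pod := &pods.Items[i]
		if pod.Status.Phase != corev1.PodFailed && pod.Status.Phase != corev1.PodSucceeded {
			continue // still pending or running; leave it alone
		}
		if err := client.CoreV1().Pods(namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil {
			return fmt.Errorf("could not delete terminal agent pod %s/%s: %w", pod.Namespace, pod.Name, err)
		}
	}
	return nil
}
```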
```diff
@@ -25,7 +25,7 @@ RUN mkdir out \
 
 # Use a runtime image based on Debian slim
 FROM debian:10.9-slim
-RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y ca-certificates procps && rm -rf /var/lib/apt/lists/*
 
 # Copy the binaries from the build-env stage
 COPY --from=build-env /work/out/pinniped-concierge /usr/local/bin/pinniped-concierge
```
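The runtime image gains the `procps` package, presumably so that process-management tools are available inside the agent container: the new integration test execs into an agent pod and runs `/usr/bin/killall sleep`, which requires such a utility to exist in the image.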
```diff
@@ -129,7 +129,7 @@ func (c *AgentPodConfig) newAgentPod(controllerManagerPod *corev1.Pod) *corev1.Pod {
 				Name:            "sleeper",
 				Image:           c.ContainerImage,
 				ImagePullPolicy: corev1.PullIfNotPresent,
-				Command:         []string{"/bin/sleep", "infinity"},
+				Command:         []string{"/bin/sh", "-c", "/bin/sleep infinity"},
 				VolumeMounts:    controllerManagerPod.Spec.Containers[0].VolumeMounts,
 				Resources: corev1.ResourceRequirements{
 					Limits: corev1.ResourceList{
```
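The agent's command changes from running `sleep` directly to running it under `/bin/sh -c`. Presumably this makes the container easier to drive into a failed state from the test: once the `sleep` process is killed, the container's entrypoint exits with a nonzero status, kubelet records the container as terminated, and the pod lands in the `Error` status the new test asserts on.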
```diff
@@ -1,4 +1,4 @@
-// Copyright 2020 the Pinniped contributors. All Rights Reserved.
+// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 package kubecertagent
```
```diff
@@ -101,7 +101,7 @@ func exampleControllerManagerAndAgentPods(
 				Image:           "some-agent-image",
 				ImagePullPolicy: corev1.PullIfNotPresent,
 				VolumeMounts:    controllerManagerPod.Spec.Containers[0].VolumeMounts,
-				Command:         []string{"/bin/sleep", "infinity"},
+				Command:         []string{"/bin/sh", "-c", "/bin/sleep infinity"},
 				Resources: corev1.ResourceRequirements{
 					Limits: corev1.ResourceList{
 						corev1.ResourceMemory: resource.MustParse("16Mi"),
```
```diff
@@ -1,9 +1,10 @@
-// Copyright 2020 the Pinniped contributors. All Rights Reserved.
+// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 package integration
 
 import (
+	"bytes"
 	"context"
 	"fmt"
 	"sort"
```
```diff
@@ -14,8 +15,12 @@ import (
 	"github.com/stretchr/testify/require"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
+	k8serrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/diff"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/kubernetes/scheme"
+	"k8s.io/client-go/tools/remotecommand"
 
 	"go.pinniped.dev/test/library"
 )
```
```diff
@@ -27,11 +32,15 @@ const (
 func TestKubeCertAgent(t *testing.T) {
 	env := library.IntegrationEnv(t).WithCapability(library.ClusterSigningKeyIsAvailable)
 
-	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
 	defer cancel()
 
 	kubeClient := library.NewClientset(t)
 
+	// Make sure the agent pods are running and healthy before the tests begin.
+	t.Logf("waiting for agent pods to become running before tests")
+	waitForAllAgentsRunning(t, kubeClient, env, ctx)
+
 	// Get the current number of kube-cert-agent pods.
 	//
 	// We can pretty safely assert there should be more than 1, since there should be a
```
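Two setup tweaks here: the overall test deadline grows from 1 to 5 minutes, and the test now waits for all agent pods to be Running before asserting anything, presumably because the new unhealthy-pod scenario needs extra time for kubelet to notice the killed process and for the controllers to replace the failed pod.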
```diff
@@ -98,26 +107,121 @@ func TestKubeCertAgent(t *testing.T) {
 			updatedAgentPod.Spec.Tolerations,
 			corev1.Toleration{Key: "fake-toleration"},
 		)
+		t.Logf("updating agent pod %s/%s with a fake toleration", updatedAgentPod.Namespace, updatedAgentPod.Name)
 		_, err = kubeClient.CoreV1().Pods(env.ConciergeNamespace).Update(ctx, updatedAgentPod, metav1.UpdateOptions{})
 		require.NoError(t, err)
 		time.Sleep(1 * time.Second)
 
 		// Make sure the original pods come back.
+		t.Logf("waiting for agent pods to reconcile")
 		assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond)
 		require.NoError(t, err)
+
+		// Make sure the pods all become healthy.
+		t.Logf("waiting for agent pods to become running")
+		waitForAllAgentsRunning(t, kubeClient, env, ctx)
 	})
 
 	t.Run("reconcile on delete", func(t *testing.T) {
 		// Delete the first pod. The controller should see it, and flip it back.
+		podToDelete := originalAgentPods.Items[0]
+		t.Logf("deleting agent pod %s/%s", podToDelete.Namespace, podToDelete.Name)
 		err = kubeClient.
 			CoreV1().
 			Pods(env.ConciergeNamespace).
-			Delete(ctx, originalAgentPods.Items[0].Name, metav1.DeleteOptions{})
+			Delete(ctx, podToDelete.Name, metav1.DeleteOptions{})
 		require.NoError(t, err)
 		time.Sleep(1 * time.Second)
 
 		// Make sure the original pods come back.
+		t.Logf("waiting for agent pods to reconcile")
 		assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond)
 		require.NoError(t, err)
+
+		// Make sure the pods all become healthy.
+		t.Logf("waiting for agent pods to become running")
+		waitForAllAgentsRunning(t, kubeClient, env, ctx)
 	})
+
+	t.Run("reconcile on unhealthy", func(t *testing.T) {
+		// Refresh this pod so we have its latest UID to compare against later.
+		podToDisrupt := &originalAgentPods.Items[0]
+		podToDisrupt, err = kubeClient.CoreV1().Pods(podToDisrupt.Namespace).Get(ctx, originalAgentPods.Items[0].Name, metav1.GetOptions{})
+		require.NoError(t, err)
+
+		// Exec into the pod and kill the sleep process, which should cause the pod to enter status.phase == "Error".
+		execRequest := kubeClient.
+			CoreV1().
+			RESTClient().
+			Post().
+			Namespace(podToDisrupt.Namespace).
+			Resource("pods").
+			Name(podToDisrupt.Name).
+			SubResource("exec").
+			VersionedParams(&corev1.PodExecOptions{
+				Stdout:  true,
+				Stderr:  true,
+				Command: []string{"/usr/bin/killall", "sleep"},
+			}, scheme.ParameterCodec)
+		executor, err := remotecommand.NewSPDYExecutor(library.NewClientConfig(t), "POST", execRequest.URL())
+		require.NoError(t, err)
+		t.Logf("execing into agent pod %s/%s to run '/usr/bin/killall sleep'", podToDisrupt.Namespace, podToDisrupt.Name)
+		var stdout, stderr bytes.Buffer
+		require.NoError(t, executor.Stream(remotecommand.StreamOptions{Stdout: &stdout, Stderr: &stderr}))
+		t.Logf("'/usr/bin/killall sleep' finished (stdout: %q, stderr: %q)", stdout.String(), stderr.String())
+
+		// Wait for that pod to disappear (since it will have failed).
+		t.Logf("waiting for unhealthy agent pod to disappear")
+		library.RequireEventuallyWithoutError(t, func() (bool, error) {
+			currentPod, err := kubeClient.CoreV1().Pods(podToDisrupt.Namespace).Get(ctx, podToDisrupt.Name, metav1.GetOptions{})
+			if err != nil {
+				if k8serrors.IsNotFound(err) {
+					return true, nil
+				}
+				return false, err
+			}
+			if currentPod.UID == podToDisrupt.UID {
+				t.Logf("pod %s/%s still exists in status %s", podToDisrupt.Namespace, podToDisrupt.Name, currentPod.Status.Phase)
+				return false, nil
+			}
+			return true, nil
+		}, 10*time.Second, 1*time.Second, "unhealthy agent pod was never deleted")
+
+		t.Logf("waiting for agent pods to reconcile")
+		// Make sure the original pods come back.
+		assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond)
+		require.NoError(t, err)
+
+		// Make sure the pods all become healthy.
+		t.Logf("waiting for agent pods to become running")
+		waitForAllAgentsRunning(t, kubeClient, env, ctx)
+	})
+
 }
 
+func waitForAllAgentsRunning(t *testing.T, kubeClient kubernetes.Interface, env *library.TestEnv, ctx context.Context) {
+	library.RequireEventuallyWithoutError(t, func() (bool, error) {
+		pods, err := kubeClient.CoreV1().Pods(env.ConciergeNamespace).List(ctx, metav1.ListOptions{
+			LabelSelector: kubeCertAgentLabelSelector,
+		})
+		if err != nil {
+			return false, err
+		}
+
+		if len(pods.Items) == 0 {
+			t.Logf("there are no agent pods yet")
+			return false, nil
+		}
+
+		allRunning := true
+		for _, pod := range pods.Items {
+			t.Logf("agent pod %s/%s is in status %s", pod.Namespace, pod.Name, pod.Status.Phase)
+			if pod.Status.Phase != corev1.PodRunning {
+				allRunning = false
+			}
+		}
+		return allRunning, nil
+	}, 60*time.Second, 2*time.Second, "agent pods never went back to Running status")
+}
+
 func sortPods(pods *corev1.PodList) {
```
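The test relies on `library.RequireEventuallyWithoutError` from Pinniped's `test/library` package, which is not shown in this diff. As a sketch of what it does, assuming it is a thin wrapper around the `k8s.io/apimachinery` polling helper (a plausible reconstruction, not necessarily the actual implementation):

```go
package library

import (
	"testing"
	"time"

	"github.com/stretchr/testify/require"
	"k8s.io/apimachinery/pkg/util/wait"
)

// RequireEventuallyWithoutError polls f until it returns (true, nil), and
// fails the test with msgAndArgs if f returns an error or the timeout
// elapses before f reports done.
func RequireEventuallyWithoutError(
	t *testing.T,
	f func() (bool, error),
	waitFor time.Duration,
	tick time.Duration,
	msgAndArgs ...interface{},
) {
	t.Helper()
	require.NoError(t, wait.PollImmediate(tick, waitFor, f), msgAndArgs...)
}
```

Unlike `assert.Eventually`, whose condition is a bare `func() bool`, this form lets the condition abort early with an error, which the test uses to distinguish "pod not gone yet" from a genuine API failure.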