Add a test to verify that the kube-cert-agent recovers when a pod becomes unhealthy.

This required some small adjustments to the production code to make this behavior feasible to test.

The new test takes an existing agent pod and terminates the `sleep` process, causing the pod to go into an `Error` status.
The agent controllers _should_ respond to this by deleting and recreating that failed pod, but the current code just gets stuck.

This is meant to replicate the situation where a cluster is suspended and resumed, which also leaves the agent pod in this terminal error state.
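For reference, the disruption the test performs can be reproduced by hand against a running Concierge; roughly (the pod name is illustrative, and `pinniped-concierge` is assumed to be the install namespace):

  kubectl exec -n pinniped-concierge <kube-cert-agent-pod> -- /usr/bin/killall sleep
  kubectl get pods -n pinniped-concierge --watch

After the kill, the agent pod shows `Error`, and a healthy controller should delete and recreate it.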

Signed-off-by: Matt Moyer <moyerm@vmware.com>
Matt Moyer 2021-04-21 15:32:50 -05:00
parent 4375c01afb
commit 24c8bdef44
4 changed files with 111 additions and 7 deletions


@@ -25,7 +25,7 @@ RUN mkdir out \
# Use a runtime image based on Debian slim
FROM debian:10.9-slim
-RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y ca-certificates procps && rm -rf /var/lib/apt/lists/*
# Copy the binaries from the build-env stage
COPY --from=build-env /work/out/pinniped-concierge /usr/local/bin/pinniped-concierge
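The added `procps` package is presumably what makes the `/usr/bin/killall` invoked by the new integration test available inside the agent container; the stock `debian:10.9-slim` image ships no tool for killing a process by name.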


@@ -129,7 +129,7 @@ func (c *AgentPodConfig) newAgentPod(controllerManagerPod *corev1.Pod) *corev1.Pod {
Name: "sleeper",
Image: c.ContainerImage,
ImagePullPolicy: corev1.PullIfNotPresent,
Command: []string{"/bin/sleep", "infinity"},
Command: []string{"/bin/sh", "-c", "/bin/sleep infinity"},
VolumeMounts: controllerManagerPod.Spec.Containers[0].VolumeMounts,
Resources: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
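The switch from a bare `/bin/sleep` to `/bin/sh -c` is presumably what makes the sleep process killable at all: a process running as PID 1 in a container receives only signals it has installed handlers for, so the plain SIGTERM sent by `killall` would likely be discarded if `sleep` itself were PID 1. With the shell wrapper, the kill lands, the container exits non-zero, and the pod drops into the `Error` state that the new test then watches for.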


@@ -1,4 +1,4 @@
-// Copyright 2020 the Pinniped contributors. All Rights Reserved.
+// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
package kubecertagent
@@ -101,7 +101,7 @@ func exampleControllerManagerAndAgentPods(
Image: "some-agent-image",
ImagePullPolicy: corev1.PullIfNotPresent,
VolumeMounts: controllerManagerPod.Spec.Containers[0].VolumeMounts,
Command: []string{"/bin/sleep", "infinity"},
Command: []string{"/bin/sh", "-c", "/bin/sleep infinity"},
Resources: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceMemory: resource.MustParse("16Mi"),


@@ -1,9 +1,10 @@
-// Copyright 2020 the Pinniped contributors. All Rights Reserved.
+// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
package integration
import (
"bytes"
"context"
"fmt"
"sort"
@@ -14,8 +15,12 @@ import (
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
+k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/diff"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/remotecommand"
"go.pinniped.dev/test/library"
)
@@ -27,11 +32,15 @@ const (
func TestKubeCertAgent(t *testing.T) {
env := library.IntegrationEnv(t).WithCapability(library.ClusterSigningKeyIsAvailable)
-ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
+ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
kubeClient := library.NewClientset(t)
+// Make sure the agent pods are running and healthy before the tests begin.
+t.Logf("waiting for agent pods to become running before tests")
+waitForAllAgentsRunning(t, kubeClient, env, ctx)
// Get the current number of kube-cert-agent pods.
//
// We can pretty safely assert there should be more than 1, since there should be a
@@ -98,26 +107,121 @@ func TestKubeCertAgent(t *testing.T) {
updatedAgentPod.Spec.Tolerations,
corev1.Toleration{Key: "fake-toleration"},
)
t.Logf("updating agent pod %s/%s with a fake toleration", updatedAgentPod.Namespace, updatedAgentPod.Name)
_, err = kubeClient.CoreV1().Pods(env.ConciergeNamespace).Update(ctx, updatedAgentPod, metav1.UpdateOptions{})
require.NoError(t, err)
time.Sleep(1 * time.Second)
// Make sure the original pods come back.
t.Logf("waiting for agent pods to reconcile")
assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond)
require.NoError(t, err)
// Make sure the pods all become healthy.
t.Logf("waiting for agent pods to become running")
waitForAllAgentsRunning(t, kubeClient, env, ctx)
})
t.Run("reconcile on delete", func(t *testing.T) {
// Delete the first pod. The controller should see it, and flip it back.
podToDelete := originalAgentPods.Items[0]
t.Logf("deleting agent pod %s/%s", podToDelete.Namespace, podToDelete.Name)
err = kubeClient.
CoreV1().
Pods(env.ConciergeNamespace).
-Delete(ctx, originalAgentPods.Items[0].Name, metav1.DeleteOptions{})
+Delete(ctx, podToDelete.Name, metav1.DeleteOptions{})
require.NoError(t, err)
time.Sleep(1 * time.Second)
// Make sure the original pods come back.
t.Logf("waiting for agent pods to reconcile")
assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond)
require.NoError(t, err)
// Make sure the pods all become healthy.
t.Logf("waiting for agent pods to become running")
waitForAllAgentsRunning(t, kubeClient, env, ctx)
})
t.Run("reconcile on unhealthy", func(t *testing.T) {
// Refresh this pod so we have its latest UID to compare against later.
podToDisrupt := &originalAgentPods.Items[0]
podToDisrupt, err = kubeClient.CoreV1().Pods(podToDisrupt.Namespace).Get(ctx, originalAgentPods.Items[0].Name, metav1.GetOptions{})
require.NoError(t, err)
// Exec into the pod and kill the sleep process, which should cause the pod to enter status.phase == "Error".
execRequest := kubeClient.
CoreV1().
RESTClient().
Post().
Namespace(podToDisrupt.Namespace).
Resource("pods").
Name(podToDisrupt.Name).
SubResource("exec").
VersionedParams(&corev1.PodExecOptions{
Stdout: true,
Stderr: true,
Command: []string{"/usr/bin/killall", "sleep"},
}, scheme.ParameterCodec)
executor, err := remotecommand.NewSPDYExecutor(library.NewClientConfig(t), "POST", execRequest.URL())
require.NoError(t, err)
t.Logf("execing into agent pod %s/%s to run '/usr/bin/killall sleep'", podToDisrupt.Namespace, podToDisrupt.Name)
var stdout, stderr bytes.Buffer
require.NoError(t, executor.Stream(remotecommand.StreamOptions{Stdout: &stdout, Stderr: &stderr}))
t.Logf("'/usr/bin/killall sleep' finished (stdout: %q, stderr: %q)", stdout.String(), stderr.String())
// Wait for that pod to disappear (since it will have failed).
t.Logf("waiting for unhealthy agent pod to disappear")
library.RequireEventuallyWithoutError(t, func() (bool, error) {
currentPod, err := kubeClient.CoreV1().Pods(podToDisrupt.Namespace).Get(ctx, podToDisrupt.Name, metav1.GetOptions{})
if err != nil {
if k8serrors.IsNotFound(err) {
return true, nil
}
return false, err
}
if currentPod.UID == podToDisrupt.UID {
t.Logf("pod %s/%s still exists in status %s", podToDisrupt.Namespace, podToDisrupt.Name, currentPod.Status.Phase)
return false, nil
}
return true, nil
}, 10*time.Second, 1*time.Second, "unhealthy agent pod was never deleted")
t.Logf("waiting for agent pods to reconcile")
// Make sure the original pods come back.
assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond)
require.NoError(t, err)
// Make sure the pods all become healthy.
t.Logf("waiting for agent pods to become running")
waitForAllAgentsRunning(t, kubeClient, env, ctx)
})
}
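// Note on agentPodsReconciled: it is a closure defined earlier in this test,
// outside the hunks shown above. From its use with assert.Eventually it must be
// a func() bool; presumably it re-lists the agent pods and reports whether they
// once again match originalAgentPods (e.g. comparing the relevant fields with
// equality.Semantic.DeepEqual after sortPods puts both lists in a stable order).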
func waitForAllAgentsRunning(t *testing.T, kubeClient kubernetes.Interface, env *library.TestEnv, ctx context.Context) {
library.RequireEventuallyWithoutError(t, func() (bool, error) {
pods, err := kubeClient.CoreV1().Pods(env.ConciergeNamespace).List(ctx, metav1.ListOptions{
LabelSelector: kubeCertAgentLabelSelector,
})
if err != nil {
return false, err
}
if len(pods.Items) == 0 {
t.Logf("there are no agent pods yet")
return false, nil
}
allRunning := true
for _, pod := range pods.Items {
t.Logf("agent pod %s/%s is in status %s", pod.Namespace, pod.Name, pod.Status.Phase)
if pod.Status.Phase != corev1.PodRunning {
allRunning = false
}
}
return allRunning, nil
}, 60*time.Second, 2*time.Second, "agent pods never went back to Running status")
}
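// For reference, library.RequireEventuallyWithoutError is likely a thin wrapper
// over k8s.io/apimachinery/pkg/util/wait, along these lines (a sketch, not
// necessarily the library's exact implementation):
//
//	func RequireEventuallyWithoutError(t *testing.T, f func() (bool, error), waitFor, tick time.Duration, msgAndArgs ...interface{}) {
//		t.Helper()
//		require.NoError(t, wait.PollImmediate(tick, waitFor, f), msgAndArgs...)
//	}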
func sortPods(pods *corev1.PodList) {