Add a test to verify that the kube-cert-agent recovers when a pod becomes unhealthy.

This required some small adjustments to the production code to make this behavior feasible to test.

The new test takes an existing agent pod and terminates the `sleep` process, causing the pod to go into an `Error` status.
The agent controllers _should_ respond to this by deleting and recreating that failed pod, but the current code just gets stuck.

This is meant to replicate the situation where a cluster is suspended and resumed, which also leaves the agent pod in this terminal error state.
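For reference, the disruption the test performs can be reproduced by hand against a running Concierge; roughly (the pod name is illustrative, and `pinniped-concierge` is assumed to be the install namespace):

  kubectl exec -n pinniped-concierge <kube-cert-agent-pod> -- /usr/bin/killall sleep
  kubectl get pods -n pinniped-concierge --watch

After the kill, the agent pod shows `Error`, and a healthy controller should delete and recreate it.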

Signed-off-by: Matt Moyer <moyerm@vmware.com>
Matt Moyer 2021-04-21 15:32:50 -05:00
parent 4375c01afb
commit 24c8bdef44
4 changed files with 111 additions and 7 deletions


@@ -25,7 +25,7 @@ RUN mkdir out \
# Use a runtime image based on Debian slim
FROM debian:10.9-slim
-RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y ca-certificates procps && rm -rf /var/lib/apt/lists/*
# Copy the binaries from the build-env stage
COPY --from=build-env /work/out/pinniped-concierge /usr/local/bin/pinniped-concierge
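The added `procps` package is presumably what makes the `/usr/bin/killall` invoked by the new integration test available inside the agent container; the stock `debian:10.9-slim` image ships no tool for killing a process by name.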


@@ -129,7 +129,7 @@ func (c *AgentPodConfig) newAgentPod(controllerManagerPod *corev1.Pod) *corev1.Pod {
Name: "sleeper",
Image: c.ContainerImage,
ImagePullPolicy: corev1.PullIfNotPresent,
Command: []string{"/bin/sleep", "infinity"},
Command: []string{"/bin/sh", "-c", "/bin/sleep infinity"},
VolumeMounts: controllerManagerPod.Spec.Containers[0].VolumeMounts,
Resources: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
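The switch from a bare `/bin/sleep` to `/bin/sh -c` is presumably what makes the sleep process killable at all: a process running as PID 1 in a container receives only signals it has installed handlers for, so the plain SIGTERM sent by `killall` would likely be discarded if `sleep` itself were PID 1. With the shell wrapper, the kill lands, the container exits non-zero, and the pod drops into the `Error` state that the new test then watches for.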


@@ -1,4 +1,4 @@
-// Copyright 2020 the Pinniped contributors. All Rights Reserved.
+// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
package kubecertagent
@@ -101,7 +101,7 @@ func exampleControllerManagerAndAgentPods(
Image: "some-agent-image",
ImagePullPolicy: corev1.PullIfNotPresent,
VolumeMounts: controllerManagerPod.Spec.Containers[0].VolumeMounts,
Command: []string{"/bin/sleep", "infinity"},
Command: []string{"/bin/sh", "-c", "/bin/sleep infinity"},
Resources: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceMemory: resource.MustParse("16Mi"),


@@ -1,9 +1,10 @@
-// Copyright 2020 the Pinniped contributors. All Rights Reserved.
+// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
package integration
import (
"bytes"
"context"
"fmt"
"sort"
@@ -14,8 +15,12 @@ import (
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
+k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/diff"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/remotecommand"
"go.pinniped.dev/test/library"
)
@@ -27,11 +32,15 @@ const (
func TestKubeCertAgent(t *testing.T) {
env := library.IntegrationEnv(t).WithCapability(library.ClusterSigningKeyIsAvailable)
-ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
+ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
kubeClient := library.NewClientset(t)
+// Make sure the agent pods are running and healthy before the tests begin.
+t.Logf("waiting for agent pods to become running before tests")
+waitForAllAgentsRunning(t, kubeClient, env, ctx)
// Get the current number of kube-cert-agent pods.
//
// We can pretty safely assert there should be more than 1, since there should be a
@@ -98,26 +107,121 @@ func TestKubeCertAgent(t *testing.T) {
updatedAgentPod.Spec.Tolerations,
corev1.Toleration{Key: "fake-toleration"},
)
t.Logf("updating agent pod %s/%s with a fake toleration", updatedAgentPod.Namespace, updatedAgentPod.Name)
_, err = kubeClient.CoreV1().Pods(env.ConciergeNamespace).Update(ctx, updatedAgentPod, metav1.UpdateOptions{})
require.NoError(t, err)
time.Sleep(1 * time.Second)
// Make sure the original pods come back.
t.Logf("waiting for agent pods to reconcile")
assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond)
require.NoError(t, err)
// Make sure the pods all become healthy.
t.Logf("waiting for agent pods to become running")
waitForAllAgentsRunning(t, kubeClient, env, ctx)
})
t.Run("reconcile on delete", func(t *testing.T) {
// Delete the first pod. The controller should see it, and flip it back.
podToDelete := originalAgentPods.Items[0]
t.Logf("deleting agent pod %s/%s", podToDelete.Namespace, podToDelete.Name)
err = kubeClient.
CoreV1().
Pods(env.ConciergeNamespace).
-Delete(ctx, originalAgentPods.Items[0].Name, metav1.DeleteOptions{})
+Delete(ctx, podToDelete.Name, metav1.DeleteOptions{})
require.NoError(t, err)
time.Sleep(1 * time.Second)
// Make sure the original pods come back.
t.Logf("waiting for agent pods to reconcile")
assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond)
require.NoError(t, err)
// Make sure the pods all become healthy.
t.Logf("waiting for agent pods to become running")
waitForAllAgentsRunning(t, kubeClient, env, ctx)
})
t.Run("reconcile on unhealthy", func(t *testing.T) {
// Refresh this pod so we have its latest UID to compare against later.
podToDisrupt := &originalAgentPods.Items[0]
podToDisrupt, err = kubeClient.CoreV1().Pods(podToDisrupt.Namespace).Get(ctx, originalAgentPods.Items[0].Name, metav1.GetOptions{})
require.NoError(t, err)
// Exec into the pod and kill the sleep process, which should cause the pod to enter status.phase == "Error".
execRequest := kubeClient.
CoreV1().
RESTClient().
Post().
Namespace(podToDisrupt.Namespace).
Resource("pods").
Name(podToDisrupt.Name).
SubResource("exec").
VersionedParams(&corev1.PodExecOptions{
Stdout: true,
Stderr: true,
Command: []string{"/usr/bin/killall", "sleep"},
}, scheme.ParameterCodec)
executor, err := remotecommand.NewSPDYExecutor(library.NewClientConfig(t), "POST", execRequest.URL())
require.NoError(t, err)
t.Logf("execing into agent pod %s/%s to run '/usr/bin/killall sleep'", podToDisrupt.Namespace, podToDisrupt.Name)
var stdout, stderr bytes.Buffer
require.NoError(t, executor.Stream(remotecommand.StreamOptions{Stdout: &stdout, Stderr: &stderr}))
t.Logf("'/usr/bin/killall sleep' finished (stdout: %q, stderr: %q)", stdout.String(), stderr.String())
// Wait for that pod to disappear (since it will have failed).
t.Logf("waiting for unhealthy agent pod to disappear")
library.RequireEventuallyWithoutError(t, func() (bool, error) {
currentPod, err := kubeClient.CoreV1().Pods(podToDisrupt.Namespace).Get(ctx, podToDisrupt.Name, metav1.GetOptions{})
if err != nil {
if k8serrors.IsNotFound(err) {
return true, nil
}
return false, err
}
if currentPod.UID == podToDisrupt.UID {
t.Logf("pod %s/%s still exists in status %s", podToDisrupt.Namespace, podToDisrupt.Name, currentPod.Status.Phase)
return false, nil
}
return true, nil
}, 10*time.Second, 1*time.Second, "unhealthy agent pod was never deleted")
t.Logf("waiting for agent pods to reconcile")
// Make sure the original pods come back.
assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond)
require.NoError(t, err)
// Make sure the pods all become healthy.
t.Logf("waiting for agent pods to become running")
waitForAllAgentsRunning(t, kubeClient, env, ctx)
})
}
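// Note on agentPodsReconciled: it is a closure defined earlier in this test,
// outside the hunks shown above. From its use with assert.Eventually it must be
// a func() bool; presumably it re-lists the agent pods and reports whether they
// once again match originalAgentPods (e.g. comparing the relevant fields with
// equality.Semantic.DeepEqual after sortPods puts both lists in a stable order).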
func waitForAllAgentsRunning(t *testing.T, kubeClient kubernetes.Interface, env *library.TestEnv, ctx context.Context) {
library.RequireEventuallyWithoutError(t, func() (bool, error) {
pods, err := kubeClient.CoreV1().Pods(env.ConciergeNamespace).List(ctx, metav1.ListOptions{
LabelSelector: kubeCertAgentLabelSelector,
})
if err != nil {
return false, err
}
if len(pods.Items) == 0 {
t.Logf("there are no agent pods yet")
return false, nil
}
allRunning := true
for _, pod := range pods.Items {
t.Logf("agent pod %s/%s is in status %s", pod.Namespace, pod.Name, pod.Status.Phase)
if pod.Status.Phase != corev1.PodRunning {
allRunning = false
}
}
return allRunning, nil
}, 60*time.Second, 2*time.Second, "agent pods never went back to Running status")
}
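// For reference, library.RequireEventuallyWithoutError is likely a thin wrapper
// over k8s.io/apimachinery/pkg/util/wait, along these lines (a sketch, not
// necessarily the library's exact implementation):
//
//	func RequireEventuallyWithoutError(t *testing.T, f func() (bool, error), waitFor, tick time.Duration, msgAndArgs ...interface{}) {
//		t.Helper()
//		require.NoError(t, wait.PollImmediate(tick, waitFor, f), msgAndArgs...)
//	}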
func sortPods(pods *corev1.PodList) {