From 24c8bdef44a4aa50967163579d11ea3184fda234 Mon Sep 17 00:00:00 2001 From: Matt Moyer Date: Wed, 21 Apr 2021 15:32:50 -0500 Subject: [PATCH] Add a test to verify that the kube-cert-agent recovers when a pod becomes unhealthy. This required some small adjustments to the production code to make it more feasible to test. The new test takes an existing agent pod and terminates the `sleep` process, causing the pod to go into an `Error` status. The agent controllers _should_ respond to this by deleting and recreating that failed pod, but the current code just gets stuck. This is meant to replicate the situation when a cluster is suspended and resumed, which also causes the agent pod to be in this terminal error state. Signed-off-by: Matt Moyer --- Dockerfile | 2 +- .../controller/kubecertagent/kubecertagent.go | 2 +- .../kubecertagent/kubecertagent_test.go | 4 +- .../concierge_kubecertagent_test.go | 110 +++++++++++++++++- 4 files changed, 111 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index bfdc2dba..2bb465bf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,7 @@ RUN mkdir out \ # Use a runtime image based on Debian slim FROM debian:10.9-slim -RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install -y ca-certificates procps && rm -rf /var/lib/apt/lists/* # Copy the binaries from the build-env stage COPY --from=build-env /work/out/pinniped-concierge /usr/local/bin/pinniped-concierge diff --git a/internal/controller/kubecertagent/kubecertagent.go b/internal/controller/kubecertagent/kubecertagent.go index cb32d0b3..9f95abe3 100644 --- a/internal/controller/kubecertagent/kubecertagent.go +++ b/internal/controller/kubecertagent/kubecertagent.go @@ -129,7 +129,7 @@ func (c *AgentPodConfig) newAgentPod(controllerManagerPod *corev1.Pod) *corev1.P Name: "sleeper", Image: c.ContainerImage, ImagePullPolicy: corev1.PullIfNotPresent, - Command: []string{"/bin/sleep", "infinity"}, + 
Command: []string{"/bin/sh", "-c", "/bin/sleep infinity"}, VolumeMounts: controllerManagerPod.Spec.Containers[0].VolumeMounts, Resources: corev1.ResourceRequirements{ Limits: corev1.ResourceList{ diff --git a/internal/controller/kubecertagent/kubecertagent_test.go b/internal/controller/kubecertagent/kubecertagent_test.go index 29bb5955..6ea5f09c 100644 --- a/internal/controller/kubecertagent/kubecertagent_test.go +++ b/internal/controller/kubecertagent/kubecertagent_test.go @@ -1,4 +1,4 @@ -// Copyright 2020 the Pinniped contributors. All Rights Reserved. +// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 package kubecertagent @@ -101,7 +101,7 @@ func exampleControllerManagerAndAgentPods( Image: "some-agent-image", ImagePullPolicy: corev1.PullIfNotPresent, VolumeMounts: controllerManagerPod.Spec.Containers[0].VolumeMounts, - Command: []string{"/bin/sleep", "infinity"}, + Command: []string{"/bin/sh", "-c", "/bin/sleep infinity"}, Resources: corev1.ResourceRequirements{ Limits: corev1.ResourceList{ corev1.ResourceMemory: resource.MustParse("16Mi"), diff --git a/test/integration/concierge_kubecertagent_test.go b/test/integration/concierge_kubecertagent_test.go index 7066dfae..57cf0b20 100644 --- a/test/integration/concierge_kubecertagent_test.go +++ b/test/integration/concierge_kubecertagent_test.go @@ -1,9 +1,10 @@ -// Copyright 2020 the Pinniped contributors. All Rights Reserved. +// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 package integration import ( + "bytes" "context" "fmt" "sort" @@ -14,8 +15,12 @@ import ( "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" + k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/diff" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/tools/remotecommand" "go.pinniped.dev/test/library" ) @@ -27,11 +32,15 @@ const ( func TestKubeCertAgent(t *testing.T) { env := library.IntegrationEnv(t).WithCapability(library.ClusterSigningKeyIsAvailable) - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() kubeClient := library.NewClientset(t) + // Make sure the agent pods are running and healthy before the tests begin. + t.Logf("waiting for agent pods to become running before tests") + waitForAllAgentsRunning(t, kubeClient, env, ctx) + // Get the current number of kube-cert-agent pods. // // We can pretty safely assert there should be more than 1, since there should be a @@ -98,26 +107,121 @@ func TestKubeCertAgent(t *testing.T) { updatedAgentPod.Spec.Tolerations, corev1.Toleration{Key: "fake-toleration"}, ) + t.Logf("updating agent pod %s/%s with a fake toleration", updatedAgentPod.Namespace, updatedAgentPod.Name) _, err = kubeClient.CoreV1().Pods(env.ConciergeNamespace).Update(ctx, updatedAgentPod, metav1.UpdateOptions{}) require.NoError(t, err) + time.Sleep(1 * time.Second) // Make sure the original pods come back. + t.Logf("waiting for agent pods to reconcile") assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond) require.NoError(t, err) + + // Make sure the pods all become healthy. 
+ t.Logf("waiting for agent pods to become running") + waitForAllAgentsRunning(t, kubeClient, env, ctx) }) t.Run("reconcile on delete", func(t *testing.T) { // Delete the first pod. The controller should see it, and flip it back. + podToDelete := originalAgentPods.Items[0] + t.Logf("deleting agent pod %s/%s", podToDelete.Namespace, podToDelete.Name) err = kubeClient. CoreV1(). Pods(env.ConciergeNamespace). - Delete(ctx, originalAgentPods.Items[0].Name, metav1.DeleteOptions{}) + Delete(ctx, podToDelete.Name, metav1.DeleteOptions{}) + require.NoError(t, err) + time.Sleep(1 * time.Second) + + // Make sure the original pods come back. + t.Logf("waiting for agent pods to reconcile") + assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond) require.NoError(t, err) + // Make sure the pods all become healthy. + t.Logf("waiting for agent pods to become running") + waitForAllAgentsRunning(t, kubeClient, env, ctx) + }) + + t.Run("reconcile on unhealthy", func(t *testing.T) { + // Refresh this pod so we have its latest UID to compare against later. + podToDisrupt := &originalAgentPods.Items[0] + podToDisrupt, err = kubeClient.CoreV1().Pods(podToDisrupt.Namespace).Get(ctx, originalAgentPods.Items[0].Name, metav1.GetOptions{}) + require.NoError(t, err) + + // Exec into the pod and kill the sleep process, which should cause the pod to enter status.phase == "Error". + execRequest := kubeClient. + CoreV1(). + RESTClient(). + Post(). + Namespace(podToDisrupt.Namespace). + Resource("pods"). + Name(podToDisrupt.Name). + SubResource("exec"). 
+ VersionedParams(&corev1.PodExecOptions{ + Stdout: true, + Stderr: true, + Command: []string{"/usr/bin/killall", "sleep"}, + }, scheme.ParameterCodec) + executor, err := remotecommand.NewSPDYExecutor(library.NewClientConfig(t), "POST", execRequest.URL()) + require.NoError(t, err) + t.Logf("execing into agent pod %s/%s to run '/usr/bin/killall sleep'", podToDisrupt.Namespace, podToDisrupt.Name) + var stdout, stderr bytes.Buffer + require.NoError(t, executor.Stream(remotecommand.StreamOptions{Stdout: &stdout, Stderr: &stderr})) + t.Logf("'/usr/bin/killall sleep' finished (stdout: %q, stderr: %q)", stdout.String(), stderr.String()) + + // Wait for that pod to be disappear (since it will have failed). + t.Logf("waiting for unhealthy agent pod to disappear") + library.RequireEventuallyWithoutError(t, func() (bool, error) { + currentPod, err := kubeClient.CoreV1().Pods(podToDisrupt.Namespace).Get(ctx, podToDisrupt.Name, metav1.GetOptions{}) + if err != nil { + if k8serrors.IsNotFound(err) { + return true, nil + } + return false, err + } + if currentPod.UID == podToDisrupt.UID { + t.Logf("pod %s/%s still exists in status %s", podToDisrupt.Namespace, podToDisrupt.Name, currentPod.Status.Phase) + return false, nil + } + return true, nil + }, 10*time.Second, 1*time.Second, "unhealthy agent pod was never deleted") + + t.Logf("waiting for agent pods to reconcile") // Make sure the original pods come back. assert.Eventually(t, agentPodsReconciled, 10*time.Second, 250*time.Millisecond) require.NoError(t, err) + + // Make sure the pods all become healthy. 
+ t.Logf("waiting for agent pods to become running") + waitForAllAgentsRunning(t, kubeClient, env, ctx) }) + +} + +func waitForAllAgentsRunning(t *testing.T, kubeClient kubernetes.Interface, env *library.TestEnv, ctx context.Context) { + library.RequireEventuallyWithoutError(t, func() (bool, error) { + pods, err := kubeClient.CoreV1().Pods(env.ConciergeNamespace).List(ctx, metav1.ListOptions{ + LabelSelector: kubeCertAgentLabelSelector, + }) + if err != nil { + return false, err + } + + if len(pods.Items) == 0 { + t.Logf("there are no agent pods yet") + return false, nil + } + + allRunning := true + for _, pod := range pods.Items { + t.Logf("agent pod %s/%s is in status %s", pod.Namespace, pod.Name, pod.Status.Phase) + if pod.Status.Phase != corev1.PodRunning { + allRunning = false + } + } + return allRunning, nil + }, 60*time.Second, 2*time.Second, "agent pods never went back to Running status") } func sortPods(pods *corev1.PodList) {