From 5290aac66fa23b8b49dfc3d6bad57fd27fb1f87f Mon Sep 17 00:00:00 2001
From: Matt Moyer <moyerm@vmware.com>
Date: Thu, 22 Apr 2021 14:51:55 -0500
Subject: [PATCH] Adjust "/usr/bin/killall sleep" in new test to be less flaky
 in CI.

Sometimes the container runtime detects the exiting PID 1 very quickly and shuts down the entire container while the `killall` process is still running.
When this happens, the exec of `killall` fails with exit code 137 (128 + 9, i.e., the process was killed by SIGKILL).

This never failed for me in Kind locally, but fails pretty often in CI (probably due to timing differences).

Signed-off-by: Matt Moyer <moyerm@vmware.com>
---
 test/integration/concierge_kubecertagent_test.go | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/test/integration/concierge_kubecertagent_test.go b/test/integration/concierge_kubecertagent_test.go
index 8a01bf45..20274c5d 100644
--- a/test/integration/concierge_kubecertagent_test.go
+++ b/test/integration/concierge_kubecertagent_test.go
@@ -8,6 +8,7 @@ import (
 	"context"
 	"fmt"
 	"sort"
+	"strings"
 	"testing"
 	"time"
 
@@ -167,7 +168,15 @@ func TestKubeCertAgent(t *testing.T) {
 		require.NoError(t, err)
 		t.Logf("execing into agent pod %s/%s to run '/usr/bin/killall sleep'", podToDisrupt.Namespace, podToDisrupt.Name)
 		var stdout, stderr bytes.Buffer
-		require.NoError(t, executor.Stream(remotecommand.StreamOptions{Stdout: &stdout, Stderr: &stderr}))
+		err = executor.Stream(remotecommand.StreamOptions{Stdout: &stdout, Stderr: &stderr})
+
+		// Some container runtimes (e.g., in CI) shut down the container so quickly after PID 1 exits that our killall process also gets a SIGKILL.
+		if err != nil && strings.Contains(err.Error(), "command terminated with exit code 137") {
+			t.Logf("ignoring SIGKILL error: %s", err.Error())
+			err = nil
+		}
+
+		require.NoError(t, err)
 		t.Logf("'/usr/bin/killall sleep' finished (stdout: %q, stderr: %q)", stdout.String(), stderr.String())
 
 		// Wait for that pod to be disappear (since it will have failed).