Adjust "/usr/bin/killall sleep" in new test to be less flaky in CI.

Sometimes the container runtime detects the exiting PID 1 very quickly and shuts down the entire container while the `killall` process is still running.
When this happens, we see it as exit code 137 (SIGKILL).

This never failed for me in Kind locally, but fails pretty often in CI (probably due to timing differences).

Signed-off-by: Matt Moyer <moyerm@vmware.com>
This commit is contained in:
Matt Moyer 2021-04-22 14:51:55 -05:00
parent 4927f1c1ad
commit 5290aac66f
No known key found for this signature in database
GPG Key ID: EAE88AD172C5AE2D

View File

@ -8,6 +8,7 @@ import (
"context"
"fmt"
"sort"
"strings"
"testing"
"time"
@ -167,7 +168,15 @@ func TestKubeCertAgent(t *testing.T) {
require.NoError(t, err)
t.Logf("execing into agent pod %s/%s to run '/usr/bin/killall sleep'", podToDisrupt.Namespace, podToDisrupt.Name)
var stdout, stderr bytes.Buffer
require.NoError(t, executor.Stream(remotecommand.StreamOptions{Stdout: &stdout, Stderr: &stderr}))
err = executor.Stream(remotecommand.StreamOptions{Stdout: &stdout, Stderr: &stderr})
// Some container runtimes (e.g., in CI) exit fast enough that our killall process also gets a SIGKILL.
if err != nil && strings.Contains(err.Error(), "command terminated with exit code 137") {
t.Logf("ignoring SIGKILL error: %s", err.Error())
err = nil
}
require.NoError(t, err)
t.Logf("'/usr/bin/killall sleep' finished (stdout: %q, stderr: %q)", stdout.String(), stderr.String())
// Wait for that pod to disappear (since it will have failed).