Adjust "/usr/bin/killall sleep" in new test to be less flaky in CI.

Sometimes the container runtime detects the exiting PID 1 very quickly and shuts down the entire container while the `killall` process is still running.
When this happens, we see it as exit code 137 (SIGKILL).

This never failed for me in Kind locally, but fails pretty often in CI (probably due to timing differences).

Signed-off-by: Matt Moyer <moyerm@vmware.com>
This commit is contained in:
Matt Moyer 2021-04-22 14:51:55 -05:00
parent 4927f1c1ad
commit 5290aac66f
No known key found for this signature in database
GPG Key ID: EAE88AD172C5AE2D

View File

@ -8,6 +8,7 @@ import (
"context"
"fmt"
"sort"
"strings"
"testing"
"time"
@ -167,7 +168,15 @@ func TestKubeCertAgent(t *testing.T) {
require.NoError(t, err)
t.Logf("execing into agent pod %s/%s to run '/usr/bin/killall sleep'", podToDisrupt.Namespace, podToDisrupt.Name)
var stdout, stderr bytes.Buffer
require.NoError(t, executor.Stream(remotecommand.StreamOptions{Stdout: &stdout, Stderr: &stderr}))
err = executor.Stream(remotecommand.StreamOptions{Stdout: &stdout, Stderr: &stderr})
// Some container runtimes (e.g., in CI) exit fast enough that our killall process also gets a SIGKILL.
if err != nil && strings.Contains(err.Error(), "command terminated with exit code 137") {
t.Logf("ignoring SIGKILL error: %s", err.Error())
err = nil
}
require.NoError(t, err)
t.Logf("'/usr/bin/killall sleep' finished (stdout: %q, stderr: %q)", stdout.String(), stderr.String())
// Wait for that pod to disappear (since it will have failed).