diff --git a/hack/prepare-for-integration-tests.sh b/hack/prepare-for-integration-tests.sh
index 25581795..1cbd8ca1 100755
--- a/hack/prepare-for-integration-tests.sh
+++ b/hack/prepare-for-integration-tests.sh
@@ -380,6 +380,8 @@ export PINNIPED_TEST_SUPERVISOR_UPSTREAM_OIDC_USERNAME=pinny@example.com
 export PINNIPED_TEST_SUPERVISOR_UPSTREAM_OIDC_PASSWORD=${dex_test_password}
 export PINNIPED_TEST_SUPERVISOR_UPSTREAM_OIDC_EXPECTED_GROUPS= # Dex's local user store does not let us configure groups.
 export PINNIPED_TEST_API_GROUP_SUFFIX='${api_group_suffix}'
+# PINNIPED_TEST_SHELL_CONTAINER_IMAGE should be a container which includes bash and sleep, used by some tests.
+export PINNIPED_TEST_SHELL_CONTAINER_IMAGE="ghcr.io/pinniped-ci-bot/test-kubectl:latest"

 # We can't set up an in-cluster active directory instance, but
 # if you have an active directory instance that you wish to run the tests against,
diff --git a/internal/controllerlib/test/integration/examplecontroller_test.go b/internal/controllerlib/test/integration/examplecontroller_test.go
index 695b8072..93bf7a0e 100644
--- a/internal/controllerlib/test/integration/examplecontroller_test.go
+++ b/internal/controllerlib/test/integration/examplecontroller_test.go
@@ -20,7 +20,7 @@ import (
 )

 func TestExampleController(t *testing.T) {
-    testlib.SkipUnlessIntegration(t)
+    _ = testlib.IntegrationEnv(t)

     config := testlib.NewClientConfig(t)

diff --git a/internal/leaderelection/leaderelection.go b/internal/leaderelection/leaderelection.go
index bb17c3de..babd5450 100644
--- a/internal/leaderelection/leaderelection.go
+++ b/internal/leaderelection/leaderelection.go
@@ -11,6 +11,7 @@ import (
     "go.uber.org/atomic"
     appsv1 "k8s.io/api/apps/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/client-go/kubernetes"
     "k8s.io/client-go/tools/leaderelection"
     "k8s.io/client-go/tools/leaderelection/resourcelock"

@@ -43,47 +44,12 @@ func New(podInfo *downward.PodInfo, deployment *appsv1.Deployment, opts ...kubec
         return nil, nil, fmt.Errorf("could not create internal client for leader election: %w", err)
     }

-    isLeader := atomic.NewBool(false)
+    isLeader := &isLeaderTracker{tracker: atomic.NewBool(false)}

     identity := podInfo.Name
     leaseName := deployment.Name

-    leaderElectionConfig := leaderelection.LeaderElectionConfig{
-        Lock: &resourcelock.LeaseLock{
-            LeaseMeta: metav1.ObjectMeta{
-                Namespace: podInfo.Namespace,
-                Name:      leaseName,
-            },
-            Client: internalClient.Kubernetes.CoordinationV1(),
-            LockConfig: resourcelock.ResourceLockConfig{
-                Identity: identity,
-            },
-        },
-        ReleaseOnCancel: true, // semantics for correct release handled by controllersWithLeaderElector below
-        LeaseDuration:   60 * time.Second,
-        RenewDeadline:   15 * time.Second,
-        RetryPeriod:     5 * time.Second,
-        Callbacks: leaderelection.LeaderCallbacks{
-            OnStartedLeading: func(_ context.Context) {
-                plog.Debug("leader gained", "identity", identity)
-                isLeader.Store(true)
-            },
-            OnStoppedLeading: func() {
-                plog.Debug("leader lost", "identity", identity)
-                isLeader.Store(false)
-            },
-            OnNewLeader: func(newLeader string) {
-                if newLeader == identity {
-                    return
-                }
-                plog.Debug("new leader elected", "newLeader", newLeader)
-            },
-        },
-        Name: leaseName,
-        // this must be set to nil because we do not want to associate /healthz with a failed
-        // leader election renewal as we do not want to exit the process if the leader changes.
-        WatchDog: nil,
-    }
+    leaderElectionConfig := newLeaderElectionConfig(podInfo.Namespace, leaseName, identity, internalClient.Kubernetes, isLeader)

     // validate our config here before we rely on it being functioning below
     if _, err := leaderelection.NewLeaderElector(leaderElectionConfig); err != nil {
@@ -103,7 +69,7 @@ func New(podInfo *downward.PodInfo, deployment *appsv1.Deployment, opts ...kubec
             return
         }

-        if isLeader.Load() { // only perform "expensive" test for writes
+        if isLeader.canWrite() { // only perform "expensive" test for writes
             return // we are currently the leader, all actions are permitted
         }

@@ -127,7 +93,11 @@ func New(podInfo *downward.PodInfo, deployment *appsv1.Deployment, opts ...kubec
     leaderElectorCtx, leaderElectorCancel := context.WithCancel(context.Background()) // purposefully detached context

     go func() {
-        controllers(ctx) // run the controllers with the global context, this blocks until the context is canceled
+        controllers(ctx) // run the controllers with the global context, this blocks until the context is canceled
+
+        if isLeader.stop() { // remove our in-memory leader status before we release the lock
+            plog.Debug("leader lost", "identity", identity, "reason", "controller stop")
+        }
         leaderElectorCancel() // once the controllers have all stopped, tell the leader elector to release the lock
     }()

@@ -148,3 +118,126 @@ func New(podInfo *downward.PodInfo, deployment *appsv1.Deployment, opts ...kubec

     return client, controllersWithLeaderElector, nil
 }
+
+func newLeaderElectionConfig(namespace, leaseName, identity string, internalClient kubernetes.Interface, isLeader *isLeaderTracker) leaderelection.LeaderElectionConfig {
+    return leaderelection.LeaderElectionConfig{
+        Lock: &releaseLock{
+            delegate: &resourcelock.LeaseLock{
+                LeaseMeta: metav1.ObjectMeta{
+                    Namespace: namespace,
+                    Name:      leaseName,
+                },
+                Client: internalClient.CoordinationV1(),
+                LockConfig: resourcelock.ResourceLockConfig{
+                    Identity: identity,
+                },
+            },
+            isLeader: isLeader,
+            identity: identity,
+        },
+        ReleaseOnCancel: true, // semantics for correct release handled by releaseLock.Update and controllersWithLeaderElector below
+
+        // Copied from defaults used in OpenShift since we want the same semantics:
+        // https://github.com/openshift/library-go/blob/e14e06ba8d476429b10cc6f6c0fcfe6ea4f2c591/pkg/config/leaderelection/leaderelection.go#L87-L109
+        LeaseDuration: 137 * time.Second,
+        RenewDeadline: 107 * time.Second,
+        RetryPeriod:   26 * time.Second,
+
+        Callbacks: leaderelection.LeaderCallbacks{
+            OnStartedLeading: func(_ context.Context) {
+                plog.Debug("leader gained", "identity", identity)
+                isLeader.start()
+            },
+            OnStoppedLeading: func() {
+                if isLeader.stop() { // barring changes to client-go, this branch should only be taken on a panic
+                    plog.Debug("leader lost", "identity", identity, "reason", "on stop")
+                }
+            },
+            OnNewLeader: func(newLeader string) {
+                if newLeader == identity {
+                    return
+                }
+                plog.Debug("new leader elected", "newLeader", newLeader)
+            },
+        },
+        Name: leaseName,
+        // this must be set to nil because we do not want to associate /healthz with a failed
+        // leader election renewal as we do not want to exit the process if the leader changes.
+        WatchDog: nil,
+    }
+}
+
+type isLeaderTracker struct {
+    tracker *atomic.Bool
+}
+
+func (t *isLeaderTracker) canWrite() bool {
+    return t.tracker.Load()
+}
+
+func (t *isLeaderTracker) start() {
+    t.tracker.Store(true)
+}
+
+func (t *isLeaderTracker) stop() (didStop bool) {
+    return t.tracker.CAS(true, false)
+}
+
+// note that resourcelock.Interface is an internal, unstable interface.
+// so while it would be convenient to embed the implementation within
+// this struct, we need to make sure our Update override is used and
+// that no other methods are added that change the meaning of the
+// interface. thus we must have ~20 lines of boilerplate to have the
+// compiler ensure that we keep up with this interface over time.
+var _ resourcelock.Interface = &releaseLock{}
+
+// releaseLock works around a limitation of the client-go leader election code:
+// there is no "BeforeRelease" callback. By the time the "OnStoppedLeading"
+// callback runs (this callback is meant to always run at the very end since it
+// normally terminates the process), we have already released the lock. This
+// creates a race condition in between the release call (the Update func) and the
+// stop callback where a different client could acquire the lease while we still
+// believe that we hold the lease in our in-memory leader status.
+type releaseLock struct {
+    delegate resourcelock.Interface // do not embed this, see comment above
+    isLeader *isLeaderTracker
+    identity string
+}
+
+func (r *releaseLock) Update(ctx context.Context, ler resourcelock.LeaderElectionRecord) error {
+    // setting an empty HolderIdentity on update means that the client is releasing the lock.
+    // thus we need to make sure to update our in-memory leader status before this occurs
+    // since other clients could immediately acquire the lock. note that even if the Update
+    // call below fails, this client has already chosen to release the lock and thus we must
+    // update the in-memory status regardless of whether we succeed in making the Kube API call.
+    // note that while resourcelock.Interface is an unstable interface, the meaning of an
+    // empty HolderIdentity is encoded into the Kube API and thus we can safely rely on that
+    // not changing (since changing that would break older clients).
+    if len(ler.HolderIdentity) == 0 && r.isLeader.stop() {
+        plog.Debug("leader lost", "identity", r.identity, "reason", "release")
+    }
+
+    return r.delegate.Update(ctx, ler)
+}
+
+// boilerplate passthrough methods below
+
+func (r *releaseLock) Get(ctx context.Context) (*resourcelock.LeaderElectionRecord, []byte, error) {
+    return r.delegate.Get(ctx)
+}
+
+func (r *releaseLock) Create(ctx context.Context, ler resourcelock.LeaderElectionRecord) error {
+    return r.delegate.Create(ctx, ler)
+}
+
+func (r *releaseLock) RecordEvent(s string) {
+    r.delegate.RecordEvent(s)
+}
+
+func (r *releaseLock) Identity() string {
+    return r.delegate.Identity()
+}
+
+func (r *releaseLock) Describe() string {
+    return r.delegate.Describe()
+}
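The reason `stop()` is a compare-and-swap rather than a plain `Store(false)` is that up to three call sites may race to clear the in-memory status (the controller-shutdown path in `New`, `OnStoppedLeading`, and `releaseLock.Update`), and only the first one should log "leader lost". A minimal sketch of that behavior — not part of this patch, and assuming it sits in the same `internal/leaderelection` package with the same imports the unit test below uses:

```go
// Not part of the patch: a small illustrative test of the isLeaderTracker semantics.
func Test_isLeaderTracker_stopFiresOnce(t *testing.T) {
    isLeader := &isLeaderTracker{tracker: atomic.NewBool(false)}

    require.False(t, isLeader.canWrite(), "not leader until start() is called")

    isLeader.start()
    require.True(t, isLeader.canWrite(), "writes are allowed while we believe we hold the lease")

    require.True(t, isLeader.stop(), "first stop() observes the transition, so its caller logs 'leader lost' exactly once")
    require.False(t, isLeader.stop(), "later stop() calls are no-ops, so there is no duplicate logging")
    require.False(t, isLeader.canWrite(), "writes are rejected after release")
}
```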
diff --git a/internal/leaderelection/leaderelection_test.go b/internal/leaderelection/leaderelection_test.go
new file mode 100644
index 00000000..8446614e
--- /dev/null
+++ b/internal/leaderelection/leaderelection_test.go
@@ -0,0 +1,83 @@
+// Copyright 2021 the Pinniped contributors. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package leaderelection
+
+import (
+    "context"
+    "errors"
+    "testing"
+    "time"
+
+    "github.com/stretchr/testify/require"
+    "go.uber.org/atomic"
+    coordinationv1 "k8s.io/api/coordination/v1"
+    "k8s.io/apimachinery/pkg/runtime"
+    kubefake "k8s.io/client-go/kubernetes/fake"
+    kubetesting "k8s.io/client-go/testing"
+    "k8s.io/client-go/tools/leaderelection"
+    "k8s.io/utils/pointer"
+)
+
+// see test/integration/leaderelection_test.go for the bulk of the testing related to this code
+
+func Test_releaseLock_Update(t *testing.T) {
+    tests := []struct {
+        name string
+        f    func(t *testing.T, internalClient *kubefake.Clientset, isLeader *isLeaderTracker, cancel context.CancelFunc)
+    }{
+        {
+            name: "renewal fails on update",
+            f: func(t *testing.T, internalClient *kubefake.Clientset, isLeader *isLeaderTracker, cancel context.CancelFunc) {
+                internalClient.PrependReactor("update", "*", func(action kubetesting.Action) (handled bool, ret runtime.Object, err error) {
+                    lease := action.(kubetesting.UpdateAction).GetObject().(*coordinationv1.Lease)
+                    if len(pointer.StringDeref(lease.Spec.HolderIdentity, "")) == 0 {
+                        require.False(t, isLeader.canWrite(), "client must release in-memory leader status before Kube API call")
+                    }
+                    return true, nil, errors.New("cannot renew")
+                })
+            },
+        },
+        {
+            name: "renewal fails due to context",
+            f: func(t *testing.T, internalClient *kubefake.Clientset, isLeader *isLeaderTracker, cancel context.CancelFunc) {
+                t.Cleanup(func() {
+                    require.False(t, isLeader.canWrite(), "client must release in-memory leader status when context is canceled")
+                })
+                start := time.Now()
+                internalClient.PrependReactor("update", "*", func(action kubetesting.Action) (handled bool, ret runtime.Object, err error) {
+                    // keep going for a bit
+                    if time.Since(start) < 5*time.Second {
+                        return false, nil, nil
+                    }
+
+                    cancel()
+                    return false, nil, nil
+                })
+            },
+        },
+    }
+    for _, tt := range tests {
+        tt := tt
+        t.Run(tt.name, func(t *testing.T) {
+            t.Parallel()
+
+            internalClient := kubefake.NewSimpleClientset()
+            isLeader := &isLeaderTracker{tracker: atomic.NewBool(false)}
+
+            leaderElectorCtx, cancel := context.WithCancel(context.Background())
+
+            tt.f(t, internalClient, isLeader, cancel)
+
+            leaderElectionConfig := newLeaderElectionConfig("ns-001", "lease-001", "foo-001", internalClient, isLeader)
+
+            // make the tests run quicker
+            leaderElectionConfig.LeaseDuration = 2 * time.Second
+            leaderElectionConfig.RenewDeadline = 1 * time.Second
+            leaderElectionConfig.RetryPeriod = 250 * time.Millisecond
+
+            // note that this will block until it exits on its own or tt.f calls cancel()
+            leaderelection.RunOrDie(leaderElectorCtx, leaderElectionConfig)
+        })
+    }
+}
diff --git a/site/content/docs/howto/install-concierge.md b/site/content/docs/howto/install-concierge.md
index 5c8afd31..b6ad47e7 100644
--- a/site/content/docs/howto/install-concierge.md
+++ b/site/content/docs/howto/install-concierge.md
@@ -57,19 +57,38 @@ Pinniped uses [ytt](https://carvel.dev/ytt/) from [Carvel](https://carvel.dev/)
 1. Customize configuration parameters:

-   - Edit `values.yaml` with your custom values.
-   - Change the `image_tag` value to match your preferred version tag, e.g. `{{< latestversion >}}`.
-   - See the [default values](http://github.com/vmware-tanzu/pinniped/tree/main/deploy/concierge/values.yaml) for documentation about individual configuration parameters.
+   - See the [default values](http://github.com/vmware-tanzu/pinniped/tree/main/deploy/concierge/values.yaml) for documentation about individual configuration parameters.
+     For example, you can change the number of Concierge pods by setting `replicas` or apply custom annotations to the impersonation proxy service using `impersonation_proxy_spec`.

-     For example, you can change the number of Concierge pods by setting `replicas` or apply custom annotations to the impersonation proxy service using `impersonation_proxy_spec`.
+   - In a different directory, create a new YAML file to contain your site-specific configuration. For example, you might call this file `site/dev-env.yaml`.
+
+     In the file, add the special ytt comment for a values file and the YAML triple-dash which starts a new YAML document.
+     Then add custom overrides for any of the parameters from [`values.yaml`](http://github.com/vmware-tanzu/pinniped/tree/main/deploy/concierge/values.yaml).
+
+     Override the `image_tag` value to match your preferred version tag, e.g. `{{< latestversion >}}`,
+     to ensure that you use the version of the server which matches these templates.
+
+     Here is an example which overrides the image tag, the default logging level, and the number of replicas:
+     ```yaml
+     #@data/values
+     ---
+     image_tag: {{< latestversion >}}
+     log_level: debug
+     replicas: 1
+     ```
+   - Parameters for which you would like to use the default value should be excluded from this file.
+
+   - If you are using a GitOps-style workflow to manage the installation of Pinniped, then you may wish to commit this new YAML file to your GitOps repository.

 1. Render templated YAML manifests:

-   - `ytt --file .`
+   - `ytt --file . --file site/dev-env.yaml`
+
+     Because the override file comes last in the list of `--file` options, its values override the defaults.

 1. Deploy the templated YAML manifests:

-   - `ytt --file . | kapp deploy --app pinniped-concierge --file -`
+   - `ytt --file . --file site/dev-env.yaml | kapp deploy --app pinniped-concierge --file -`

 ## Next steps

diff --git a/site/content/docs/howto/install-supervisor.md b/site/content/docs/howto/install-supervisor.md
index d2ca21e0..9ee5491d 100644
--- a/site/content/docs/howto/install-supervisor.md
+++ b/site/content/docs/howto/install-supervisor.md
@@ -49,17 +49,38 @@ Pinniped uses [ytt](https://carvel.dev/ytt/) from [Carvel](https://carvel.dev/)
 1. Customize configuration parameters:

-   - Edit `values.yaml` with your custom values.
-   - Change the `image_tag` value to match your preferred version tag, e.g. `{{< latestversion >}}`.
-   - See the [default values](http://github.com/vmware-tanzu/pinniped/tree/main/deploy/supervisor/values.yaml) for documentation about individual configuration parameters.
+   - See the [default values](http://github.com/vmware-tanzu/pinniped/tree/main/deploy/supervisor/values.yaml) for documentation about individual configuration parameters.
+     For example, you can change the number of Supervisor pods by setting `replicas`.
+
+   - In a different directory, create a new YAML file to contain your site-specific configuration. For example, you might call this file `site/dev-env.yaml`.
+
+     In the file, add the special ytt comment for a values file and the YAML triple-dash which starts a new YAML document.
+     Then add custom overrides for any of the parameters from [`values.yaml`](http://github.com/vmware-tanzu/pinniped/tree/main/deploy/supervisor/values.yaml).
+
+     Override the `image_tag` value to match your preferred version tag, e.g. `{{< latestversion >}}`,
+     to ensure that you use the version of the server which matches these templates.
+
+     Here is an example which overrides the image tag, the default logging level, and the number of replicas:
+     ```yaml
+     #@data/values
+     ---
+     image_tag: {{< latestversion >}}
+     log_level: debug
+     replicas: 1
+     ```
+   - Parameters for which you would like to use the default value should be excluded from this file.
+
+   - If you are using a GitOps-style workflow to manage the installation of Pinniped, then you may wish to commit this new YAML file to your GitOps repository.

 1. Render templated YAML manifests:

-   - `ytt --file .`
+   - `ytt --file . --file site/dev-env.yaml`
+
+     Because the override file comes last in the list of `--file` options, its values override the defaults.

 1. Deploy the templated YAML manifests:

-   `ytt --file . | kapp deploy --app pinniped-supervisor --file -`
+   `ytt --file . --file site/dev-env.yaml | kapp deploy --app pinniped-supervisor --file -`

 ## Next steps

diff --git a/test/integration/cli_test.go b/test/integration/cli_test.go
index ea710e92..26ba36bd 100644
--- a/test/integration/cli_test.go
+++ b/test/integration/cli_test.go
@@ -37,7 +37,8 @@ import (
     "go.pinniped.dev/test/testlib/browsertest"
 )

-func TestCLIGetKubeconfigStaticToken(t *testing.T) {
+// safe to run in parallel with serial tests since it only interacts with a test local webhook, see main_test.go.
+func TestCLIGetKubeconfigStaticToken_Parallel(t *testing.T) {
     env := testlib.IntegrationEnv(t).WithCapability(testlib.ClusterSigningKeyIsAvailable)

     // Create a test webhook configuration to use with the CLI.
diff --git a/test/integration/concierge_api_serving_certs_test.go b/test/integration/concierge_api_serving_certs_test.go
index 86267a39..67d06296 100644
--- a/test/integration/concierge_api_serving_certs_test.go
+++ b/test/integration/concierge_api_serving_certs_test.go
@@ -17,7 +17,8 @@ import (
     "go.pinniped.dev/test/testlib"
 )

-func TestAPIServingCertificateAutoCreationAndRotation(t *testing.T) {
+// Never run this test in parallel since breaking discovery is disruptive, see main_test.go.
+func TestAPIServingCertificateAutoCreationAndRotation_Disruptive(t *testing.T) {
     env := testlib.IntegrationEnv(t)
     defaultServingCertResourceName := env.ConciergeAppName + "-api-tls-serving-certificate"

diff --git a/test/integration/concierge_credentialrequest_test.go b/test/integration/concierge_credentialrequest_test.go
index 61782de3..30a771e7 100644
--- a/test/integration/concierge_credentialrequest_test.go
+++ b/test/integration/concierge_credentialrequest_test.go
@@ -22,7 +22,8 @@ import (
     "go.pinniped.dev/test/testlib"
 )

-func TestUnsuccessfulCredentialRequest(t *testing.T) {
+// TCRs are non-mutating and safe to run in parallel with serial tests, see main_test.go.
+func TestUnsuccessfulCredentialRequest_Parallel(t *testing.T) {
     env := testlib.IntegrationEnv(t).WithCapability(testlib.AnonymousAuthenticationSupported)

     ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
@@ -44,7 +45,8 @@ func TestUnsuccessfulCredentialRequest(t *testing.T) {
     require.Equal(t, "authentication failed", *response.Status.Message)
 }

-func TestSuccessfulCredentialRequest(t *testing.T) {
+// TCRs are non-mutating and safe to run in parallel with serial tests, see main_test.go.
+func TestSuccessfulCredentialRequest_Parallel(t *testing.T) {
     env := testlib.IntegrationEnv(t).WithCapability(testlib.ClusterSigningKeyIsAvailable)

     ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute)
@@ -129,7 +131,8 @@ func TestSuccessfulCredentialRequest(t *testing.T) {
     }
 }

-func TestFailedCredentialRequestWhenTheRequestIsValidButTheTokenDoesNotAuthenticateTheUser(t *testing.T) {
+// TCRs are non-mutating and safe to run in parallel with serial tests, see main_test.go.
+func TestFailedCredentialRequestWhenTheRequestIsValidButTheTokenDoesNotAuthenticateTheUser_Parallel(t *testing.T) {
     _ = testlib.IntegrationEnv(t).WithCapability(testlib.ClusterSigningKeyIsAvailable)

     // Create a testWebhook so we have a legitimate authenticator to pass to the
@@ -149,7 +152,8 @@ func TestFailedCredentialRequestWhenTheRequestIsValidButTheTokenDoesNotAuthentic
     require.Equal(t, pointer.StringPtr("authentication failed"), response.Status.Message)
 }

-func TestCredentialRequest_ShouldFailWhenRequestDoesNotIncludeToken(t *testing.T) {
+// TCRs are non-mutating and safe to run in parallel with serial tests, see main_test.go.
+func TestCredentialRequest_ShouldFailWhenRequestDoesNotIncludeToken_Parallel(t *testing.T) {
     _ = testlib.IntegrationEnv(t).WithCapability(testlib.ClusterSigningKeyIsAvailable)

     // Create a testWebhook so we have a legitimate authenticator to pass to the
diff --git a/test/integration/concierge_impersonation_proxy_test.go b/test/integration/concierge_impersonation_proxy_test.go
index 35c09234..5c3b0994 100644
--- a/test/integration/concierge_impersonation_proxy_test.go
+++ b/test/integration/concierge_impersonation_proxy_test.go
@@ -948,9 +948,10 @@ func TestImpersonationProxy(t *testing.T) { //nolint:gocyclo // yeah, it's compl
             corev1.PodSpec{
                 Containers: []corev1.Container{
                     {
-                        Name:    "ignored-but-required",
-                        Image:   "busybox",
-                        Command: []string{"sh", "-c", "sleep 3600"},
+                        Name:            "sleeper",
+                        Image:           env.ShellContainerImage,
+                        ImagePullPolicy: corev1.PullIfNotPresent,
+                        Command:         []string{"sh", "-c", "sleep 3600"},
                     },
                 },
                 ServiceAccountName: saName,
@@ -1064,7 +1065,7 @@ func TestImpersonationProxy(t *testing.T) { //nolint:gocyclo // yeah, it's compl
         // existing Concierge pod because we need more tools than we can get from a scratch/distroless base image.
         runningTestPod := testlib.CreatePod(ctx, t, "impersonation-proxy", env.ConciergeNamespace, corev1.PodSpec{Containers: []corev1.Container{{
             Name:            "impersonation-proxy-test",
-            Image:           "debian:10.10-slim",
+            Image:           env.ShellContainerImage,
             ImagePullPolicy: corev1.PullIfNotPresent,
             Command:         []string{"bash", "-c", `while true; do read VAR; echo "VAR: $VAR"; done`},
             Stdin:           true,
diff --git a/test/integration/concierge_kubecertagent_test.go b/test/integration/concierge_kubecertagent_test.go
index 4a756f94..e44ad68e 100644
--- a/test/integration/concierge_kubecertagent_test.go
+++ b/test/integration/concierge_kubecertagent_test.go
@@ -93,7 +93,8 @@ func findSuccessfulStrategy(credentialIssuer *conciergev1alpha.CredentialIssuer,
     return nil
 }

-func TestLegacyPodCleaner(t *testing.T) {
+// safe to run in parallel with serial tests since it only interacts with a test local pod, see main_test.go.
+func TestLegacyPodCleaner_Parallel(t *testing.T) {
     env := testlib.IntegrationEnv(t).WithCapability(testlib.ClusterSigningKeyIsAvailable)
     ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
     defer cancel()
@@ -117,9 +118,10 @@ func TestLegacyPodCleaner(t *testing.T) {
         },
         Spec: corev1.PodSpec{
             Containers: []corev1.Container{{
-                Name:    "sleeper",
-                Image:   "debian:10.9-slim",
-                Command: []string{"/bin/sleep", "infinity"},
+                Name:            "sleeper",
+                Image:           env.ShellContainerImage,
+                ImagePullPolicy: corev1.PullIfNotPresent,
+                Command:         []string{"/bin/sleep", "infinity"},
             }},
         },
     }, metav1.CreateOptions{})
diff --git a/test/integration/concierge_kubectl_test.go b/test/integration/concierge_kubectl_test.go
index 04ba3bbd..a113c465 100644
--- a/test/integration/concierge_kubectl_test.go
+++ b/test/integration/concierge_kubectl_test.go
@@ -15,7 +15,7 @@ import (

 // Smoke test to see if the kubeconfig works and the cluster is reachable.
 func TestGetNodes(t *testing.T) {
-    testlib.SkipUnlessIntegration(t)
+    _ = testlib.IntegrationEnv(t)
     cmd := exec.Command("kubectl", "get", "nodes")
     cmd.Stdout = os.Stdout
     cmd.Stderr = os.Stderr
diff --git a/test/integration/formposthtml_test.go b/test/integration/formposthtml_test.go
index f44e1ae5..bfc900df 100644
--- a/test/integration/formposthtml_test.go
+++ b/test/integration/formposthtml_test.go
@@ -24,7 +24,10 @@ import (
     "go.pinniped.dev/test/testlib/browsertest"
 )

-func TestFormPostHTML(t *testing.T) {
+// safe to run in parallel with serial tests since it only interacts with a test local server, see main_test.go.
+func TestFormPostHTML_Parallel(t *testing.T) {
+    _ = testlib.IntegrationEnv(t)
+
     // Run a mock callback handler, simulating the one running in the CLI.
     callbackURL, expectCallback := formpostCallbackServer(t)

diff --git a/test/integration/ldap_client_test.go b/test/integration/ldap_client_test.go
index 99a6b7fb..9c21698c 100644
--- a/test/integration/ldap_client_test.go
+++ b/test/integration/ldap_client_test.go
@@ -24,7 +24,8 @@ import (
     "go.pinniped.dev/test/testlib"
 )

-func TestLDAPSearch(t *testing.T) {
+// safe to run in parallel with serial tests since it only makes read requests to our test LDAP server, see main_test.go.
+func TestLDAPSearch_Parallel(t *testing.T) {
     // This test does not interact with Kubernetes itself. It is a test of our LDAP client code, and only interacts
     // with our test OpenLDAP server, which is exposed directly to this test via kubectl port-forward.
     // Theoretically we should always be able to run this test, but something about the kubectl port forwarding
diff --git a/test/integration/leaderelection_test.go b/test/integration/leaderelection_test.go
index 9f5eaecd..d9da6c57 100644
--- a/test/integration/leaderelection_test.go
+++ b/test/integration/leaderelection_test.go
@@ -6,7 +6,6 @@ package integration
 import (
     "context"
     "encoding/json"
-    "errors"
     "testing"
     "time"

@@ -28,19 +27,18 @@ import (
     "go.pinniped.dev/test/testlib"
 )

-func TestLeaderElection(t *testing.T) {
+// safe to run in parallel with serial tests since it only interacts with a test local lease, see main_test.go.
+func TestLeaderElection_Parallel(t *testing.T) {
     _ = testlib.IntegrationEnv(t)

-    t.Parallel()
-
-    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+    ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute)
     t.Cleanup(cancel)

     leaseName := "leader-election-" + rand.String(5)

     namespace := testlib.CreateNamespace(ctx, t, leaseName)

-    clients := leaderElectionClients(t, namespace, leaseName)
+    clients, cancels := leaderElectionClients(t, namespace, leaseName)

     // the tests below are order dependant to some degree and definitely cannot be run in parallel

@@ -68,9 +66,52 @@ func TestLeaderElection(t *testing.T) {
         lease := checkOnlyLeaderCanWrite(ctx, t, namespace, leaseName, clients)
         logLease(t, lease)
     })
+
+    t.Run("stop current leader", func(t *testing.T) {
+        startLease := waitForIdentity(ctx, t, namespace, leaseName, clients)
+        startTransitions := *startLease.Spec.LeaseTransitions
+        startTime := *startLease.Spec.AcquireTime
+        startLeaderIdentity := *startLease.Spec.HolderIdentity
+
+        leaderClient := clients[startLeaderIdentity]
+
+        err := runWriteRequest(ctx, leaderClient)
+        require.NoError(t, err)
+
+        // emulate stopping the leader process
+        cancels[startLeaderIdentity]()
+        delete(clients, startLeaderIdentity)
+
+        testlib.RequireEventually(t, func(requireEventually *require.Assertions) {
+            err := runWriteRequest(ctx, leaderClient)
+            requireEventually.ErrorIs(err, leaderelection.ErrNotLeader, "leader should no longer be able to write")
+        }, time.Minute, time.Second)
+
+        if len(clients) > 0 {
+            finalLease := waitForIdentity(ctx, t, namespace, leaseName, clients)
+            finalTransitions := *finalLease.Spec.LeaseTransitions
+            finalTime := *finalLease.Spec.AcquireTime
+            finalLeaderIdentity := *finalLease.Spec.HolderIdentity
+
+            require.Greater(t, finalTransitions, startTransitions)
+            require.Greater(t, finalTime.UnixNano(), startTime.UnixNano())
+            require.NotEqual(t, startLeaderIdentity, finalLeaderIdentity, "should have elected new leader")
+
+            logLease(t, finalLease)
+        }
+    })
+
+    t.Run("sanity check write prevention after stopping leader", func(t *testing.T) {
+        if len(clients) == 0 {
+            t.Skip("no clients left to check")
+        }
+
+        lease := checkOnlyLeaderCanWrite(ctx, t, namespace, leaseName, clients)
+        logLease(t, lease)
+    })
 }

-func leaderElectionClient(t *testing.T, namespace *corev1.Namespace, leaseName, identity string) *kubeclient.Client {
+func leaderElectionClient(t *testing.T, namespace *corev1.Namespace, leaseName, identity string) (*kubeclient.Client, context.CancelFunc) {
     t.Helper()

     podInfo := &downward.PodInfo{
@@ -119,23 +160,24 @@ func leaderElectionClient(t *testing.T, namespace *corev1.Namespace, leaseName,
         leaderCancel()
     }()

-    return client
+    return client, controllerCancel
 }

-func leaderElectionClients(t *testing.T, namespace *corev1.Namespace, leaseName string) map[string]*kubeclient.Client {
+func leaderElectionClients(t *testing.T, namespace *corev1.Namespace, leaseName string) (map[string]*kubeclient.Client, map[string]context.CancelFunc) {
     t.Helper()

     count := rand.IntnRange(1, 6)
-    out := make(map[string]*kubeclient.Client, count)
+    clients := make(map[string]*kubeclient.Client, count)
+    cancels := make(map[string]context.CancelFunc, count)

     for i := 0; i < count; i++ {
         identity := "leader-election-client-" + rand.String(5)
-        out[identity] = leaderElectionClient(t, namespace, leaseName, identity)
+        clients[identity], cancels[identity] = leaderElectionClient(t, namespace, leaseName, identity)
     }

-    t.Logf("running leader election client tests with %d clients: %v", len(out), sets.StringKeySet(out).List())
+    t.Logf("running leader election client tests with %d clients: %v", len(clients), sets.StringKeySet(clients).List())

-    return out
+    return clients, cancels
 }

 func pickRandomLeaderElectionClient(clients map[string]*kubeclient.Client) *kubeclient.Client {
@@ -155,14 +197,17 @@ func waitForIdentity(ctx context.Context, t *testing.T, namespace *corev1.Namesp
     testlib.RequireEventuallyWithoutError(t, func() (bool, error) {
         lease, err := pickRandomLeaderElectionClient(clients).Kubernetes.CoordinationV1().Leases(namespace.Name).Get(ctx, leaseName, metav1.GetOptions{})
         if apierrors.IsNotFound(err) {
+            t.Logf("lease %s/%s does not exist", namespace.Name, leaseName)
             return false, nil
         }
         if err != nil {
             return false, err
         }
         out = lease
+        t.Logf("lease %s/%s - current leader identity: %s, valid leader identities: %s",
+            namespace.Name, leaseName, pointer.StringDeref(lease.Spec.HolderIdentity, ""), identities.List())
         return lease.Spec.HolderIdentity != nil && identities.Has(*lease.Spec.HolderIdentity), nil
-    }, 3*time.Minute, time.Second)
+    }, 10*time.Minute, 10*time.Second)

     return out
 }
@@ -209,12 +254,12 @@ func checkOnlyLeaderCanWrite(ctx context.Context, t *testing.T, namespace *corev
             } else {
                 nonLeaders++
                 requireEventually.Error(err, "non leader client %q should have write error but it was nil", identity)
-                requireEventually.True(errors.Is(err, leaderelection.ErrNotLeader), "non leader client %q should have write error: %v", identity, err)
+                requireEventually.ErrorIs(err, leaderelection.ErrNotLeader, "non leader client %q should have write error: %v", identity, err)
             }
         }
         requireEventually.Equal(1, leaders, "did not see leader")
         requireEventually.Equal(len(clients)-1, nonLeaders, "did not see non-leader")
-    }, time.Minute, time.Second)
+    }, 3*time.Minute, 3*time.Second)

     return lease
 }
@@ -231,7 +276,7 @@ func forceTransition(ctx context.Context, t *testing.T, namespace *corev1.Namesp
         startTime = *startLease.Spec.AcquireTime

         startLease = startLease.DeepCopy()
-        startLease.Spec.HolderIdentity = pointer.String("some-other-client" + rand.String(5))
+        startLease.Spec.HolderIdentity = pointer.String("some-other-client-" + rand.String(5))

         _, err := pickCurrentLeaderClient(ctx, t, namespace, leaseName, clients).
             Kubernetes.CoordinationV1().Leases(namespace.Name).Update(ctx, startLease, metav1.UpdateOptions{})
@@ -246,8 +291,6 @@ func forceTransition(ctx context.Context, t *testing.T, namespace *corev1.Namesp
     require.Greater(t, finalTransitions, startTransitions)
     require.Greater(t, finalTime.UnixNano(), startTime.UnixNano())

-    time.Sleep(2 * time.Minute) // need to give clients time to notice this change because leader election is polling based
-
     return finalLease
 }

@@ -264,8 +307,6 @@ func forceRestart(ctx context.Context, t *testing.T, namespace *corev1.Namespace
     require.Zero(t, *newLease.Spec.LeaseTransitions)
     require.Greater(t, newLease.Spec.AcquireTime.UnixNano(), startLease.Spec.AcquireTime.UnixNano())

-    time.Sleep(2 * time.Minute) // need to give clients time to notice this change because leader election is polling based
-
     return newLease
 }

diff --git a/test/integration/main_test.go b/test/integration/main_test.go
new file mode 100644
index 00000000..6147da4c
--- /dev/null
+++ b/test/integration/main_test.go
@@ -0,0 +1,118 @@
+// Copyright 2020-2021 the Pinniped contributors. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package integration
+
+import (
+    "os"
+    "reflect"
+    "strings"
+    "testing"
+    "unsafe"
+
+    "go.pinniped.dev/test/testlib"
+)
+
+func TestMain(m *testing.M) {
+    splitIntegrationTestsIntoBuckets(m)
+    os.Exit(m.Run())
+}
+
+func splitIntegrationTestsIntoBuckets(m *testing.M) {
+    // this is some dark magic to set a private field
+    testsField := reflect.ValueOf(m).Elem().FieldByName("tests")
+    testsPointer := (*[]testing.InternalTest)(unsafe.Pointer(testsField.UnsafeAddr()))
+
+    tests := *testsPointer
+
+    if len(tests) == 0 {
+        return
+    }
+
+    var serialTests, parallelTests, disruptiveTests, finalTests []testing.InternalTest
+
+    for _, test := range tests {
+        test := test
+
+        // top level integration tests that end with the string _Parallel
+        // indicate that they are safe to run in parallel with
+        // the serial tests (which Go does not let you easily express).
+        // top level tests that want the standard Go behavior of only running
+        // parallel tests with other parallel tests should use the regular
+        // t.Parallel() approach. this has no effect on any subtest.
+        switch {
+        case strings.HasSuffix(test.Name, "_Parallel"):
+            parallelTests = append(parallelTests, test)

+        // top level integration tests that end with the string _Disruptive
+        // indicate that they are never safe to run with any other
+        // test because they break the underlying cluster in some way.
+        case strings.HasSuffix(test.Name, "_Disruptive"):
+            disruptiveTests = append(disruptiveTests, test)
+
+        default:
+            serialTests = append(serialTests, test)
+        }
+    }
+
+    serialTest := testing.InternalTest{
+        Name: "TestIntegrationSerial",
+        F: func(t *testing.T) {
+            _ = testlib.IntegrationEnv(t) // make sure these tests do not run during unit tests
+            t.Parallel()                  // outer test always runs in parallel for this bucket
+
+            for _, test := range serialTests {
+                test := test
+                t.Run(test.Name, func(t *testing.T) {
+                    test.F(t) // inner serial tests do not run in parallel
+                })
+            }
+        },
+    }
+
+    parallelTest := testing.InternalTest{
+        Name: "TestIntegrationParallel",
+        F: func(t *testing.T) {
+            _ = testlib.IntegrationEnv(t) // make sure these tests do not run during unit tests
+            t.Parallel()                  // outer test always runs in parallel for this bucket
+
+            for _, test := range parallelTests {
+                test := test
+                t.Run(test.Name, func(t *testing.T) {
+                    t.Parallel() // inner parallel tests do run in parallel
+
+                    test.F(t)
+                })
+            }
+        },
+    }
+
+    disruptiveTest := testing.InternalTest{
+        Name: "TestIntegrationDisruptive",
+        F: func(t *testing.T) {
+            _ = testlib.IntegrationEnv(t) // make sure these tests do not run during unit tests
+            // outer test never runs in parallel for this bucket
+
+            for _, test := range disruptiveTests {
+                test := test
+                t.Run(test.Name, func(t *testing.T) {
+                    test.F(t) // inner disruptive tests do not run in parallel
+                })
+            }
+        },
+    }
+
+    if len(parallelTests) > 0 {
+        finalTests = append(finalTests, parallelTest)
+    }
+
+    if len(serialTests) > 0 {
+        finalTests = append(finalTests, serialTest)
+    }
+
+    if len(disruptiveTests) > 0 {
+        finalTests = append(finalTests, disruptiveTest)
+    }
+
+    *testsPointer = finalTests
+}
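With this `TestMain` in place, a top-level integration test chooses its bucket purely through its name suffix, which is all that `splitIntegrationTestsIntoBuckets` looks at. A hypothetical illustration, not part of this patch (`TestMyNewFeature` and `TestMyClusterBreakingThing` are invented names), of how new tests would opt in:

```go
// Not part of the patch: invented examples showing the naming convention that
// splitIntegrationTestsIntoBuckets keys off of.

// Runs inside the TestIntegrationParallel bucket, concurrently with the serial
// bucket and with every other *_Parallel test.
func TestMyNewFeature_Parallel(t *testing.T) {
    _ = testlib.IntegrationEnv(t) // still skipped when `go test -short` is used

    // ... test body ...
}

// Runs inside the TestIntegrationDisruptive bucket, which never runs
// concurrently with any other test.
func TestMyClusterBreakingThing_Disruptive(t *testing.T) {
    _ = testlib.IntegrationEnv(t)

    // ... test body ...
}
```

Any other top-level test name falls into the serial bucket by default, which preserves the pre-existing behavior.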
diff --git a/test/integration/supervisor_discovery_test.go b/test/integration/supervisor_discovery_test.go
index 8bc48a5f..08f499bf 100644
--- a/test/integration/supervisor_discovery_test.go
+++ b/test/integration/supervisor_discovery_test.go
@@ -39,7 +39,8 @@ import (
 //
 // Testing talking to the supervisor's port 8443 where the supervisor is terminating TLS itself is
 // handled by the others tests in this file.
-func TestSupervisorOIDCDiscovery(t *testing.T) {
+// Never run this test in parallel since deleting all federation domains is disruptive, see main_test.go.
+func TestSupervisorOIDCDiscovery_Disruptive(t *testing.T) {
     env := testlib.IntegrationEnv(t)
     client := testlib.NewSupervisorClientset(t)

@@ -143,7 +144,8 @@ func TestSupervisorOIDCDiscovery(t *testing.T) {
     }
 }

-func TestSupervisorTLSTerminationWithSNI(t *testing.T) {
+// Never run this test in parallel since deleting all federation domains is disruptive, see main_test.go.
+func TestSupervisorTLSTerminationWithSNI_Disruptive(t *testing.T) {
     env := testlib.IntegrationEnv(t)
     pinnipedClient := testlib.NewSupervisorClientset(t)
     kubeClient := testlib.NewKubernetesClientset(t)
@@ -214,7 +216,8 @@ func TestSupervisorTLSTerminationWithSNI(t *testing.T) {
     })
 }

-func TestSupervisorTLSTerminationWithDefaultCerts(t *testing.T) {
+// Never run this test in parallel since deleting all federation domains is disruptive, see main_test.go.
+func TestSupervisorTLSTerminationWithDefaultCerts_Disruptive(t *testing.T) {
     env := testlib.IntegrationEnv(t)
     pinnipedClient := testlib.NewSupervisorClientset(t)
     kubeClient := testlib.NewKubernetesClientset(t)
diff --git a/test/integration/supervisor_secrets_test.go b/test/integration/supervisor_secrets_test.go
index 949a27cc..65ff64a8 100644
--- a/test/integration/supervisor_secrets_test.go
+++ b/test/integration/supervisor_secrets_test.go
@@ -18,7 +18,8 @@ import (
     "go.pinniped.dev/test/testlib"
 )

-func TestSupervisorSecrets(t *testing.T) {
+// safe to run in parallel with serial tests since it only interacts with a test local federation domain, see main_test.go.
+func TestSupervisorSecrets_Parallel(t *testing.T) {
     env := testlib.IntegrationEnv(t)
     kubeClient := testlib.NewKubernetesClientset(t)
     supervisorClient := testlib.NewSupervisorClientset(t)
diff --git a/test/integration/supervisor_storage_garbage_collection_test.go b/test/integration/supervisor_storage_garbage_collection_test.go
index 52d70157..588c0779 100644
--- a/test/integration/supervisor_storage_garbage_collection_test.go
+++ b/test/integration/supervisor_storage_garbage_collection_test.go
@@ -19,11 +19,8 @@ import (
     "go.pinniped.dev/test/testlib"
 )

-func TestStorageGarbageCollection(t *testing.T) {
-    // Run this test in parallel with the other integration tests because it does a lot of waiting
-    // and will not impact other tests, or be impacted by other tests, when run in parallel.
-    t.Parallel()
-
+// safe to run in parallel with serial tests since it only interacts with test local secrets, see main_test.go.
+func TestStorageGarbageCollection_Parallel(t *testing.T) {
     env := testlib.IntegrationEnv(t)
     client := testlib.NewKubernetesClientset(t)
     secrets := client.CoreV1().Secrets(env.SupervisorNamespace)
diff --git a/test/integration/whoami_test.go b/test/integration/whoami_test.go
index fe708613..17bae321 100644
--- a/test/integration/whoami_test.go
+++ b/test/integration/whoami_test.go
@@ -29,7 +29,8 @@ import (
     "go.pinniped.dev/test/testlib"
 )

-func TestWhoAmI_Kubeadm(t *testing.T) {
+// whoami requests are non-mutating and safe to run in parallel with serial tests, see main_test.go.
+func TestWhoAmI_Kubeadm_Parallel(t *testing.T) {
     // use the cluster signing key being available as a proxy for this being a kubeadm cluster
     // we should add more robust logic around skipping clusters based on vendor
     _ = testlib.IntegrationEnv(t).WithCapability(testlib.ClusterSigningKeyIsAvailable)
@@ -60,7 +61,8 @@ func TestWhoAmI_Kubeadm(t *testing.T) {
     )
 }

-func TestWhoAmI_ServiceAccount_Legacy(t *testing.T) {
+// whoami requests are non-mutating and safe to run in parallel with serial tests, see main_test.go.
+func TestWhoAmI_ServiceAccount_Legacy_Parallel(t *testing.T) {
     _ = testlib.IntegrationEnv(t)

     ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
@@ -133,8 +135,9 @@ func TestWhoAmI_ServiceAccount_Legacy(t *testing.T) {
     )
 }

-func TestWhoAmI_ServiceAccount_TokenRequest(t *testing.T) {
-    _ = testlib.IntegrationEnv(t)
+// whoami requests are non-mutating and safe to run in parallel with serial tests, see main_test.go.
+func TestWhoAmI_ServiceAccount_TokenRequest_Parallel(t *testing.T) {
+    env := testlib.IntegrationEnv(t)

     ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
     defer cancel()
@@ -168,9 +171,10 @@ func TestWhoAmI_ServiceAccount_TokenRequest(t *testing.T) {
         corev1.PodSpec{
             Containers: []corev1.Container{
                 {
-                    Name:    "ignored-but-required",
-                    Image:   "busybox",
-                    Command: []string{"sh", "-c", "sleep 3600"},
+                    Name:            "sleeper",
+                    Image:           env.ShellContainerImage,
+                    ImagePullPolicy: corev1.PullIfNotPresent,
+                    Command:         []string{"sh", "-c", "sleep 3600"},
                 },
             },
             ServiceAccountName: sa.Name,
@@ -241,7 +245,8 @@ func TestWhoAmI_ServiceAccount_TokenRequest(t *testing.T) {
     )
 }

-func TestWhoAmI_CSR(t *testing.T) {
+// whoami requests are non-mutating and safe to run in parallel with serial tests, see main_test.go.
+func TestWhoAmI_CSR_Parallel(t *testing.T) {
     // use the cluster signing key being available as a proxy for this not being an EKS cluster
     // we should add more robust logic around skipping clusters based on vendor
     _ = testlib.IntegrationEnv(t).WithCapability(testlib.ClusterSigningKeyIsAvailable)
@@ -329,7 +334,8 @@ func TestWhoAmI_CSR(t *testing.T) {
     )
 }

-func TestWhoAmI_Anonymous(t *testing.T) {
+// whoami requests are non-mutating and safe to run in parallel with serial tests, see main_test.go.
+func TestWhoAmI_Anonymous_Parallel(t *testing.T) {
     _ = testlib.IntegrationEnv(t).WithCapability(testlib.AnonymousAuthenticationSupported)

     ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
@@ -359,7 +365,8 @@ func TestWhoAmI_Anonymous(t *testing.T) {
     )
 }

-func TestWhoAmI_ImpersonateDirectly(t *testing.T) {
+// whoami requests are non-mutating and safe to run in parallel with serial tests, see main_test.go.
+func TestWhoAmI_ImpersonateDirectly_Parallel(t *testing.T) {
     _ = testlib.IntegrationEnv(t)

     ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
diff --git a/test/testlib/env.go b/test/testlib/env.go
index 22085fbd..a5fc8847 100644
--- a/test/testlib/env.go
+++ b/test/testlib/env.go
@@ -54,6 +54,7 @@ type TestEnv struct {
     SupervisorHTTPSIngressCABundle string `json:"supervisorHttpsIngressCABundle"`
     Proxy                          string `json:"proxy"`
     APIGroupSuffix                 string `json:"apiGroupSuffix"`
+    ShellContainerImage            string `json:"shellContainer"`

     TestUser struct {
         Token string `json:"token"`
@@ -127,7 +128,7 @@ func IntegrationEnv(t *testing.T) *TestEnv {
     }

     t.Helper()
-    SkipUnlessIntegration(t)
+    skipUnlessIntegration(t)

     capabilitiesDescriptionYAML := os.Getenv("PINNIPED_TEST_CLUSTER_CAPABILITY_YAML")
     capabilitiesDescriptionFile := os.Getenv("PINNIPED_TEST_CLUSTER_CAPABILITY_FILE")
@@ -232,6 +233,7 @@ func loadEnvVars(t *testing.T, result *TestEnv) {
     result.Proxy = os.Getenv("PINNIPED_TEST_PROXY")
     result.APIGroupSuffix = wantEnv("PINNIPED_TEST_API_GROUP_SUFFIX", "pinniped.dev")
+    result.ShellContainerImage = needEnv(t, "PINNIPED_TEST_SHELL_CONTAINER_IMAGE")

     result.CLIUpstreamOIDC = TestOIDCUpstream{
         Issuer: needEnv(t, "PINNIPED_TEST_CLI_OIDC_ISSUER"),
diff --git a/test/testlib/skip.go b/test/testlib/skip.go
index 75e0e9fc..6f7de643 100644
--- a/test/testlib/skip.go
+++ b/test/testlib/skip.go
@@ -5,8 +5,8 @@ package testlib

 import "testing"

-// SkipUnlessIntegration skips the current test if `-short` has been passed to `go test`.
-func SkipUnlessIntegration(t *testing.T) {
+// skipUnlessIntegration skips the current test if `-short` has been passed to `go test`.
+func skipUnlessIntegration(t *testing.T) {
     t.Helper()
     if testing.Short() {
         t.Skip("skipping integration test because of '-short' flag")
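As a rough sketch of how the testlib changes above are meant to be consumed (not part of the patch; the test name is invented and the imports are assumed to match the other integration tests): `loadEnvVars` requires `PINNIPED_TEST_SHELL_CONTAINER_IMAGE` via `needEnv`, and tests then reference the image through `env.ShellContainerImage` instead of hard-coding `busybox` or `debian` tags.

```go
// Not part of the patch: an invented test showing the intended use of the new
// ShellContainerImage field from test/testlib/env.go.
func TestSomethingThatNeedsAShellPod_Parallel(t *testing.T) {
    env := testlib.IntegrationEnv(t) // fails early when PINNIPED_TEST_SHELL_CONTAINER_IMAGE is unset

    podSpec := corev1.PodSpec{
        Containers: []corev1.Container{{
            Name:            "sleeper",
            Image:           env.ShellContainerImage, // configurable image with bash and sleep, instead of a hard-coded public tag
            ImagePullPolicy: corev1.PullIfNotPresent,
            Command:         []string{"sh", "-c", "sleep 3600"},
        }},
    }

    _ = podSpec // a real test would hand this to a helper such as testlib.CreatePod, as the diff above does
}
```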