ContainerImage.Pinniped/test/integration/api_serving_certs_test.go
Andrew Keesler a4fe76f6a9
test/integration: increase confidence that a cert has rotated
It looks like requests to our aggregated API service on GKE vacillate
between success and failure until they reach a converged successful
state. I think this has to do with our pods updating the API serving
cert at different times. If only one pod updates its serving cert to
the correct value, then it should respond with success. However, the
other pod would respond with failure. Depending on the load balancing
algorithm that GKE uses to send traffic to pods in a service, we could
end up with a success that we interpret as "all pods have rotated
their certs" when it really just means "at least one pod has rotated
its certs."

Signed-off-by: Andrew Keesler <akeesler@vmware.com>
2020-08-28 10:20:05 -04:00

168 lines
6.1 KiB
Go

/*
Copyright 2020 VMware, Inc.
SPDX-License-Identifier: Apache-2.0
*/
package integration
import (
"context"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"github.com/suzerain-io/pinniped/generated/1.19/apis/pinniped/v1alpha1"
"github.com/suzerain-io/pinniped/internal/testutil"
"github.com/suzerain-io/pinniped/test/library"
)
func TestAPIServingCertificateAutoCreationAndRotation(t *testing.T) {
library.SkipUnlessIntegration(t)
tests := []struct {
name string
forceRotation func(context.Context, kubernetes.Interface, string) error
}{
{
name: "manual",
forceRotation: func(
ctx context.Context,
kubeClient kubernetes.Interface,
namespace string,
) error {
// Delete the Secret, simulating an end user doing `kubectl delete` to manually ask for an immediate rotation.
return kubeClient.
CoreV1().
Secrets(namespace).
Delete(ctx, "api-serving-cert", metav1.DeleteOptions{})
},
},
{
name: "automatic",
forceRotation: func(
ctx context.Context,
kubeClient kubernetes.Interface,
namespace string,
) error {
// Create a cert that is expired - this should force the rotation controller
// to delete the cert, and therefore the cert should get rotated.
secret, err := kubeClient.
CoreV1().
Secrets(namespace).
Get(ctx, "api-serving-cert", metav1.GetOptions{})
if err != nil {
return err
}
secret.Data["tlsCertificateChain"], err = createExpiredCertificate()
if err != nil {
return err
}
_, err = kubeClient.
CoreV1().
Secrets(namespace).
Update(ctx, secret, metav1.UpdateOptions{})
return err
},
},
}
for _, test := range tests {
test := test
t.Run(test.name, func(t *testing.T) {
namespaceName := library.GetEnv(t, "PINNIPED_NAMESPACE")
kubeClient := library.NewClientset(t)
aggregatedClient := library.NewAggregatedClientset(t)
pinnipedClient := library.NewPinnipedClientset(t)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
const apiServiceName = "v1alpha1.pinniped.dev"
// Get the initial auto-generated version of the Secret.
secret, err := kubeClient.CoreV1().Secrets(namespaceName).Get(ctx, "api-serving-cert", metav1.GetOptions{})
require.NoError(t, err)
initialCACert := secret.Data["caCertificate"]
initialPrivateKey := secret.Data["tlsPrivateKey"]
initialCertChain := secret.Data["tlsCertificateChain"]
require.NotEmpty(t, initialCACert)
require.NotEmpty(t, initialPrivateKey)
require.NotEmpty(t, initialCertChain)
// Check that the APIService has the same CA.
apiService, err := aggregatedClient.ApiregistrationV1().APIServices().Get(ctx, apiServiceName, metav1.GetOptions{})
require.NoError(t, err)
require.Equal(t, initialCACert, apiService.Spec.CABundle)
// Force rotation to happen.
require.NoError(t, test.forceRotation(ctx, kubeClient, namespaceName))
// Expect that the Secret comes back right away with newly minted certs.
secretIsRegenerated := func() bool {
secret, err = kubeClient.CoreV1().Secrets(namespaceName).Get(ctx, "api-serving-cert", metav1.GetOptions{})
return err == nil
}
assert.Eventually(t, secretIsRegenerated, 10*time.Second, 250*time.Millisecond)
require.NoError(t, err) // prints out the error and stops the test in case of failure
regeneratedCACert := secret.Data["caCertificate"]
regeneratedPrivateKey := secret.Data["tlsPrivateKey"]
regeneratedCertChain := secret.Data["tlsCertificateChain"]
require.NotEmpty(t, regeneratedCACert)
require.NotEmpty(t, regeneratedPrivateKey)
require.NotEmpty(t, regeneratedCertChain)
require.NotEqual(t, initialCACert, regeneratedCACert)
require.NotEqual(t, initialPrivateKey, regeneratedPrivateKey)
require.NotEqual(t, initialCertChain, regeneratedCertChain)
// Expect that the APIService was also updated with the new CA.
aggregatedAPIUpdated := func() bool {
apiService, err = aggregatedClient.ApiregistrationV1().APIServices().Get(ctx, apiServiceName, metav1.GetOptions{})
return err == nil
}
assert.Eventually(t, aggregatedAPIUpdated, 10*time.Second, 250*time.Millisecond)
require.NoError(t, err) // prints out the error and stops the test in case of failure
require.Equal(t, regeneratedCACert, apiService.Spec.CABundle)
// Check that we can still make requests to the aggregated API through the kube API server,
// because the kube API server uses these certs when proxying requests to the aggregated API server,
// so this is effectively checking that the aggregated API server is using these new certs.
// We ensure that 10 straight requests succeed so that we filter out false positives where a single
// pod has rotated their cert, but not the other ones sitting behind the service.
aggregatedAPIWorking := func() bool {
for i := 0; i < 10; i++ {
_, err = pinnipedClient.PinnipedV1alpha1().CredentialRequests().Create(ctx, &v1alpha1.CredentialRequest{
TypeMeta: metav1.TypeMeta{},
ObjectMeta: metav1.ObjectMeta{},
Spec: v1alpha1.CredentialRequestSpec{
Type: v1alpha1.TokenCredentialType,
Token: &v1alpha1.CredentialRequestTokenCredential{Value: "not a good token"},
},
}, metav1.CreateOptions{})
if err != nil {
break
}
}
// Should have got a success response with an error message inside it complaining about the token value.
return err == nil
}
// Unfortunately, although our code changes all the certs immediately, it seems to take ~1 minute for
// the API machinery to notice that we updated our serving cert, causing 1 minute of downtime for our endpoint.
assert.Eventually(t, aggregatedAPIWorking, 2*time.Minute, 250*time.Millisecond)
require.NoError(t, err) // prints out the error and stops the test in case of failure
})
}
}
func createExpiredCertificate() ([]byte, error) {
return testutil.CreateCertificate(
time.Now().Add(-24*time.Hour), // notBefore
time.Now().Add(-time.Hour), // notAfter
)
}