From ecde8fa8af5a0e1dde962079ac846beafa808a17 Mon Sep 17 00:00:00 2001
From: Ryan Richard <richardry@vmware.com>
Date: Mon, 17 Aug 2020 16:44:42 -0700
Subject: [PATCH] Implement basic liveness and readiness probes

- Call the auto-generated /healthz endpoint of our aggregated API server
- Use http for liveness even though tcp seems like it might be
  more appropriate, because tcp probes cause TLS handshake errors
  to appear in our logs every few seconds
- Use conservative timeouts and retries on the liveness probe to avoid
  having our container get restarted when it is temporarily slow due
  to running in an environment under resource pressure
- Use less conservative timeouts and retries for the readiness probe,
  so that an unhealthy pod is removed from the service sooner than
  its container would be restarted
- Tuning the settings for retries and timeouts seems to be a mysterious
  art, so these are just a first draft
---
 deploy/deployment.yaml | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/deploy/deployment.yaml b/deploy/deployment.yaml
index f5ef824b..d0a3cac0 100644
--- a/deploy/deployment.yaml
+++ b/deploy/deployment.yaml
@@ -88,6 +88,24 @@ spec:
               mountPath: /etc/podinfo
             - name: k8s-certs
               mountPath: /etc/kubernetes/pki
+          livenessProbe:  # failing this probe gets the container restarted
+            httpGet:  # http, not tcp: tcp probes log TLS handshake errors
+              path: /healthz  # auto-generated by the aggregated API server
+              port: 443
+              scheme: HTTPS
+            initialDelaySeconds: 20
+            timeoutSeconds: 15  # conservative: tolerate resource pressure
+            periodSeconds: 10  # NOTE(review): shorter than timeout; confirm
+            failureThreshold: 5  # conservative retries; first-draft value
+          readinessProbe:  # failing this removes the pod from the service
+            httpGet:
+              path: /healthz  # same endpoint as the liveness probe
+              port: 443
+              scheme: HTTPS
+            initialDelaySeconds: 20
+            timeoutSeconds: 3  # less conservative than liveness (15)
+            periodSeconds: 10
+            failureThreshold: 3  # less conservative than liveness (5)
       volumes:
         - name: config-volume
           configMap: