Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: unreliable webhook behaviour on gatekeeper pod startup and shutdown #3780

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ import (
"net/http"
_ "net/http/pprof"
"os"
"os/signal"
"path/filepath"
"syscall"
"time"

"github.com/go-logr/zapr"
Expand Down Expand Up @@ -117,6 +119,7 @@ var (
enableK8sCel = flag.Bool("enable-k8s-native-validation", true, "enable the validating admission policy driver")
externaldataProviderResponseCacheTTL = flag.Duration("external-data-provider-response-cache-ttl", 3*time.Minute, "TTL for the external data provider response cache. Specify the duration in 'h', 'm', or 's' for hours, minutes, or seconds respectively. Defaults to 3 minutes if unspecified. Setting the TTL to 0 disables the cache.")
enableReferential = flag.Bool("enable-referential-rules", true, "Enable referential rules. This flag defaults to true. Set this value to false if you want to disallow referential constraints. Because referential constraints read objects other than the object-under-test, they may be subject to race conditions. Users concerned about this may want to disable referential rules")
shutdownDelay = flag.Int("shutdown-delay", 10, "Time in seconds the controller runtime shutdown gets delayed after receiving a pod termination event. Prevents failing webhooks on pod shutdown. default: 10")
)

func init() {
Expand Down Expand Up @@ -309,9 +312,39 @@ func innerMain() int {
}
}

// Always enable downstream checking for the webhooks, if enabled.
if len(webhooks) > 0 {
tlsChecker := webhook.NewTLSChecker(*certDir, *port)
setupLog.Info("setting up TLS readiness probe")
if err := mgr.AddReadyzCheck("tls-check", tlsChecker); err != nil {
setupLog.Error(err, "unable to create tls readiness check")
return 1
}
}

// Setup controllers asynchronously, they will block for certificate generation if needed.
setupErr := make(chan error)
ctx := ctrl.SetupSignalHandler()

// Setup termination with grace period. Required to give K8s Services time to disconnect the Pod endpoint on termination.
// Derived from how the controller-runtime sets up a signal handler with ctrl.SetupSignalHandler()
// controller-runtime upstream issue: https://github.com/kubernetes-sigs/controller-runtime/issues/3113
ctx, cancel := context.WithCancel(context.Background())

c := make(chan os.Signal, 2)
signal.Notify(c, []os.Signal{os.Interrupt, syscall.SIGTERM}...)
go func() {
<-c
setupLog.Info("Shutting Down, waiting for 10s")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should get this value from shutdownDelay instead of hardcoding it.

go func() {
time.Sleep(time.Duration(*shutdownDelay) * time.Second)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similarly we should add a test to validate this:
e.g.

# Get the name of the Gatekeeper pod
POD_NAME=$(kubectl get pods -n ${GATEKEEPER_NAMESPACE} -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}')

# Trigger the termination of the Gatekeeper pod
TERMINATION_START=$(date +%s)
kubectl delete pod ${POD_NAME} -n ${GATEKEEPER_NAMESPACE}

# Monitor the termination process to ensure the grace period is respected
echo "Monitoring the termination process..."
kubectl get pod ${POD_NAME} -n ${GATEKEEPER_NAMESPACE} -w --ignore-not-found &

# Wait for the pod to be fully terminated
kubectl wait --for=delete pod/${POD_NAME} -n ${GATEKEEPER_NAMESPACE} --timeout=120s

TERMINATION_END=$(date +%s)
TERMINATION_DURATION=$((TERMINATION_END - TERMINATION_START))

# Validate that the pod is terminated after the specified grace period
GRACE_PERIOD=10
if [ "${TERMINATION_DURATION}" -ge "${GRACE_PERIOD}" ]; then
    echo "Pod termination respected the grace period of ${GRACE_PERIOD} seconds."
else
    echo "Pod termination did not respect the grace period. Termination duration: ${TERMINATION_DURATION} seconds."
fi

setupLog.Info("Shutdown grace period finished")
cancel()
}()
<-c
setupLog.Info("Second signal received, killing now")
os.Exit(1) // second signal. Exit directly.
}()

go func() {
setupErr <- setupControllers(ctx, mgr, tracker, setupFinished)
}()
Expand Down
Loading