
Commit 5cc1fc8
Committed Jul 1, 2024

make plank settings configurable

Signed-off-by: Tim Ramlot <42113979+inteon@users.noreply.github.com>

1 parent 6a85753 · commit 5cc1fc8

File tree

5 files changed: +72 -21 lines
 

‎pkg/config/config.go

+29 lines

@@ -655,6 +655,17 @@ type Plank struct {
 	// stuck in an unscheduled state. Defaults to 5 minutes.
 	PodUnscheduledTimeout *metav1.Duration `json:"pod_unscheduled_timeout,omitempty"`
 
+	// MaxRetries is the maximum number of times a prowjob will be retried before
+	// being marked as failed. Defaults to 3. A value of 0 means no retries.
+	MaxRetries *int `json:"max_retries,omitempty"`
+
+	// NodeTerminationReasons is a set of reasons on which the controller will
+	// match to determine if a node is being terminated. If a node is being terminated
+	// the controller will restart the prowjob, unless the ErrorOnTermination option is set
+	// on the prowjob or the MaxRetries option is reached.
+	// Defaults to ["DeletionByPodGC", "DeletionByGCPControllerManager"].
+	NodeTerminationReasons []string `json:"node_termination_reasons,omitempty"`
+
 	// DefaultDecorationConfigs holds the default decoration config for specific values.
 	//
 	// Each entry in the slice specifies Repo and Cluster regexp filter fields to

@@ -2495,6 +2506,24 @@ func parseProwConfig(c *Config) error {
 		c.Plank.PodUnscheduledTimeout = &metav1.Duration{Duration: 5 * time.Minute}
 	}
 
+	if c.Plank.MaxRetries == nil {
+		maxRetries := 3
+		c.Plank.MaxRetries = &maxRetries
+	}
+
+	if c.Plank.NodeTerminationReasons == nil {
+		c.Plank.NodeTerminationReasons = []string{
+			// If the node does no longer exist and the pod gets garbage collected,
+			// this condition will be set:
+			// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
+			"DeletionByPodGC",
+			// On GCP, before a new spot instance is started, the old pods are garbage
+			// collected (if they have not been already by the Kubernetes PodGC):
+			// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
+			"DeletionByGCPControllerManager",
+		}
+	}
+
 	if err := c.Gerrit.DefaultAndValidate(); err != nil {
 		return fmt.Errorf("validating gerrit config: %w", err)
 	}
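Taken together, the two hunks above add the new fields and their defaulting. In the Prow config file they surface under the plank section; the snippet below is a minimal sketch of those keys (names taken from the json tags above) with their defaulted values, matching the expected output in the config_test.go changes further down:

plank:
  # Defaulted to 3 when max_retries is left unset; a value of 0 disables retries.
  max_retries: 3
  # Defaulted when node_termination_reasons is left unset.
  node_termination_reasons:
  - DeletionByPodGC
  - DeletionByGCPControllerManager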

‎pkg/config/config_test.go

+16 lines

@@ -8417,6 +8417,10 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_retries: 3
+  node_termination_reasons:
+  - DeletionByPodGC
+  - DeletionByGCPControllerManager
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s

@@ -8501,6 +8505,10 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_retries: 3
+  node_termination_reasons:
+  - DeletionByPodGC
+  - DeletionByGCPControllerManager
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s

@@ -8578,6 +8586,10 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_retries: 3
+  node_termination_reasons:
+  - DeletionByPodGC
+  - DeletionByGCPControllerManager
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s

@@ -8660,6 +8672,10 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_retries: 3
+  node_termination_reasons:
+  - DeletionByPodGC
+  - DeletionByGCPControllerManager
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s

‎pkg/config/prow-config-documented.yaml

+10 lines

@@ -1013,6 +1013,16 @@ plank:
   # JobURLPrefixDisableAppendStorageProvider disables that the storageProvider is
   # automatically appended to the JobURLPrefix.
   jobURLPrefixDisableAppendStorageProvider: true
+  # MaxRetries is the maximum number of times a prowjob will be retried before
+  # being marked as failed. Defaults to 3. A value of 0 means no retries.
+  max_retries: 0
+  # NodeTerminationReasons is a set of reasons on which the controller will
+  # match to determine if a node is being terminated. If a node is being terminated
+  # the controller will restart the prowjob, unless the ErrorOnTermination option is set
+  # on the prowjob or the MaxRetries option is reached.
+  # Defaults to ["DeletionByPodGC", "DeletionByGCPControllerManager"].
+  node_termination_reasons:
+  - ""
   # PodPendingTimeout defines how long the controller will wait to perform a garbage
   # collection on pending pods. Defaults to 10 minutes.
   pod_pending_timeout: 0s
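As a usage example, an operator can also override both settings explicitly. The snippet below is a sketch only; the last reason string is hypothetical. Note that because the defaults are applied only when node_termination_reasons is left unset (see the nil check in parseProwConfig above), an explicit list replaces the built-in reasons rather than extending them:

plank:
  # A value of 0 means no retries: a terminated pod errors the job immediately.
  max_retries: 0
  # Explicit list replaces the built-in defaults, so repeat them if still wanted.
  node_termination_reasons:
  - DeletionByPodGC
  - DeletionByGCPControllerManager
  - DeletionByMyCustomNodeDrainer   # hypothetical, cluster-specific reason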

‎pkg/plank/controller_test.go

+11 -4 lines

@@ -65,6 +65,11 @@ const (
 	podDeletionPreventionFinalizer = "keep-from-vanishing"
 )
 
+var (
+	maxRetries             = 3
+	nodeTerminationReasons = []string{"DeletionByPodGC", "DeletionByGCPControllerManager"}
+)
+
 func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[string]int) *fca {
 	presubmits := []config.Presubmit{
 		{

@@ -102,10 +107,12 @@ func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[st
 				MaxConcurrency: maxConcurrency,
 				MaxGoroutines:  20,
 			},
-			JobQueueCapacities:    queueCapacities,
-			PodPendingTimeout:     &metav1.Duration{Duration: podPendingTimeout},
-			PodRunningTimeout:     &metav1.Duration{Duration: podRunningTimeout},
-			PodUnscheduledTimeout: &metav1.Duration{Duration: podUnscheduledTimeout},
+			JobQueueCapacities:     queueCapacities,
+			PodPendingTimeout:      &metav1.Duration{Duration: podPendingTimeout},
+			PodRunningTimeout:      &metav1.Duration{Duration: podRunningTimeout},
+			PodUnscheduledTimeout:  &metav1.Duration{Duration: podUnscheduledTimeout},
+			MaxRetries:             &maxRetries,
+			NodeTerminationReasons: nodeTerminationReasons,
 		},
 	},
 	JobConfig: config.JobConfig{

‎pkg/plank/reconciler.go

+6 -17 lines

@@ -21,6 +21,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"slices"
 	"strings"
 	"sync"
 	"time"

@@ -60,8 +61,6 @@ import (
 
 const ControllerName = "plank"
 
-const MaxPodRetries = 3
-
 // PodStatus constants
 const (
 	Evicted = "Evicted"

@@ -464,7 +463,7 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 			pj.Status.PodName = pn
 			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
 		}
-	} else if transientFailure := getTransientFailure(pod); transientFailure != PodTransientFailureNone {
+	} else if transientFailure := getTransientFailure(pod, r.config().Plank.NodeTerminationReasons); transientFailure != PodTransientFailureNone {
 		switch {
 		case transientFailure == PodTransientFailureEvicted && pj.Spec.ErrorOnEviction:
 			// ErrorOnEviction is enabled, complete the PJ and mark it as errored.

@@ -478,12 +477,12 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
 			pj.Status.Description = "Job pod's node was terminated."
-		case pj.Status.RetryCount >= MaxPodRetries:
+		case pj.Status.RetryCount >= *r.config().Plank.MaxRetries:
 			// MaxPodRetries is reached, complete the PJ and mark it as errored.
 			r.log.WithField("transient-failure", transientFailure).WithFields(pjutil.ProwJobFields(pj)).Info("Pod Node reached max retries, fail job.")
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
-			pj.Status.Description = fmt.Sprintf("Job pod reached max retries (%d) for transient failure %s", MaxPodRetries, transientFailure)
+			pj.Status.Description = fmt.Sprintf("Job pod reached max retries (%d) for transient failure %s", pj.Status.RetryCount, transientFailure)
 		default:
 			// Update the retry count and delete the pod so it gets recreated in the next resync.
 			pj.Status.RetryCount++

@@ -666,7 +665,7 @@ const (
 	PodTransientFailureUnreachable PodTransientFailure = "unreachable"
 )
 
-func getTransientFailure(pod *corev1.Pod) PodTransientFailure {
+func getTransientFailure(pod *corev1.Pod, nodeTerminationReasons []string) PodTransientFailure {
 	if pod.Status.Reason == Evicted {
 		return PodTransientFailureEvicted
 	}

@@ -679,17 +678,7 @@ func getTransientFailure(pod *corev1.Pod) PodTransientFailure {
 	}
 
 	for _, condition := range pod.Status.Conditions {
-		// If the node does no longer exist and the pod gets garbage collected,
-		// this condition will be set:
-		// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
-		if condition.Reason == "DeletionByPodGC" {
-			return PodTransientFailureTerminated
-		}
-
-		// On GCP, before a new spot instance is started, the old pods are garbage
-		// collected (if they have not been already by the Kubernetes PodGC):
-		// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
-		if condition.Reason == "DeletionByGCPControllerManager" {
+		if slices.Contains(nodeTerminationReasons, condition.Reason) {
 			return PodTransientFailureTerminated
 		}
 	}
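With the hard-coded reasons gone, the classification reduces to a membership check against the configured list, read from r.config().Plank.NodeTerminationReasons on each sync. The sketch below is a self-contained illustration of that check; classifyNodeTermination is an illustrative name rather than the real getTransientFailure, and it returns a bool instead of the PodTransientFailure enum:

package main

import (
    "fmt"
    "slices"

    corev1 "k8s.io/api/core/v1"
)

// classifyNodeTermination mirrors the loop above: a pod counts as terminated
// (and therefore retriable) when any of its conditions carries one of the
// configured node-termination reasons.
func classifyNodeTermination(pod *corev1.Pod, nodeTerminationReasons []string) bool {
    for _, condition := range pod.Status.Conditions {
        if slices.Contains(nodeTerminationReasons, condition.Reason) {
            return true
        }
    }
    return false
}

func main() {
    // Condition shape as set by the Kubernetes PodGC when the node disappears;
    // only the Reason field is consulted by the check above.
    pod := &corev1.Pod{
        Status: corev1.PodStatus{
            Conditions: []corev1.PodCondition{
                {Type: "DisruptionTarget", Reason: "DeletionByPodGC"},
            },
        },
    }

    // Same values as the defaulted c.Plank.NodeTerminationReasons.
    reasons := []string{"DeletionByPodGC", "DeletionByGCPControllerManager"}
    fmt.Println(classifyNodeTermination(pod, reasons)) // true -> plank retries the job
}

Because the list is plain configuration, operators can recognize additional termination reasons (or drop the GCP-specific one) without a code change.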
