
Commit 5cc1fc8
Committed Jul 1, 2024

make plank settings configurable

Signed-off-by: Tim Ramlot <42113979+inteon@users.noreply.github.com>

1 parent 6a85753 · commit 5cc1fc8

File tree

5 files changed: +72 -21 lines
 

‎pkg/config/config.go

+29 lines

@@ -655,6 +655,17 @@ type Plank struct {
 	// stuck in an unscheduled state. Defaults to 5 minutes.
 	PodUnscheduledTimeout *metav1.Duration `json:"pod_unscheduled_timeout,omitempty"`
 
+	// MaxRetries is the maximum number of times a prowjob will be retried before
+	// being marked as failed. Defaults to 3. A value of 0 means no retries.
+	MaxRetries *int `json:"max_retries,omitempty"`
+
+	// NodeTerminationReasons is a set of reasons on which the controller will
+	// match to determine if a node is being terminated. If a node is being terminated
+	// the controller will restart the prowjob, unless the ErrorOnTermination option is set
+	// on the prowjob or the MaxRetries option is reached.
+	// Defaults to ["DeletionByPodGC", "DeletionByGCPControllerManager"].
+	NodeTerminationReasons []string `json:"node_termination_reasons,omitempty"`
+
 	// DefaultDecorationConfigs holds the default decoration config for specific values.
 	//
 	// Each entry in the slice specifies Repo and Cluster regexp filter fields to

@@ -2495,6 +2506,24 @@ func parseProwConfig(c *Config) error {
 		c.Plank.PodUnscheduledTimeout = &metav1.Duration{Duration: 5 * time.Minute}
 	}
 
+	if c.Plank.MaxRetries == nil {
+		maxRetries := 3
+		c.Plank.MaxRetries = &maxRetries
+	}
+
+	if c.Plank.NodeTerminationReasons == nil {
+		c.Plank.NodeTerminationReasons = []string{
+			// If the node does no longer exist and the pod gets garbage collected,
+			// this condition will be set:
+			// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
+			"DeletionByPodGC",
+			// On GCP, before a new spot instance is started, the old pods are garbage
+			// collected (if they have not been already by the Kubernetes PodGC):
+			// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
+			"DeletionByGCPControllerManager",
+		}
+	}
+
 	if err := c.Gerrit.DefaultAndValidate(); err != nil {
 		return fmt.Errorf("validating gerrit config: %w", err)
 	}
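Taken together, the two hunks above add the new fields and their defaulting. In the Prow config file they surface under the plank section; the snippet below is a minimal sketch of those keys (names taken from the json tags above) with their defaulted values, matching the expected output in the config_test.go changes further down:

plank:
  # Defaulted to 3 when max_retries is left unset; a value of 0 disables retries.
  max_retries: 3
  # Defaulted when node_termination_reasons is left unset.
  node_termination_reasons:
  - DeletionByPodGC
  - DeletionByGCPControllerManager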

‎pkg/config/config_test.go

+16 lines

@@ -8417,6 +8417,10 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_retries: 3
+  node_termination_reasons:
+  - DeletionByPodGC
+  - DeletionByGCPControllerManager
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s

@@ -8501,6 +8505,10 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_retries: 3
+  node_termination_reasons:
+  - DeletionByPodGC
+  - DeletionByGCPControllerManager
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s

@@ -8578,6 +8586,10 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_retries: 3
+  node_termination_reasons:
+  - DeletionByPodGC
+  - DeletionByGCPControllerManager
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s

@@ -8660,6 +8672,10 @@ moonraker:
   client_timeout: 10m0s
 plank:
   max_goroutines: 20
+  max_retries: 3
+  node_termination_reasons:
+  - DeletionByPodGC
+  - DeletionByGCPControllerManager
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s

‎pkg/config/prow-config-documented.yaml

+10 lines

@@ -1013,6 +1013,16 @@ plank:
   # JobURLPrefixDisableAppendStorageProvider disables that the storageProvider is
   # automatically appended to the JobURLPrefix.
   jobURLPrefixDisableAppendStorageProvider: true
+  # MaxRetries is the maximum number of times a prowjob will be retried before
+  # being marked as failed. Defaults to 3. A value of 0 means no retries.
+  max_retries: 0
+  # NodeTerminationReasons is a set of reasons on which the controller will
+  # match to determine if a node is being terminated. If a node is being terminated
+  # the controller will restart the prowjob, unless the ErrorOnTermination option is set
+  # on the prowjob or the MaxRetries option is reached.
+  # Defaults to ["DeletionByPodGC", "DeletionByGCPControllerManager"].
+  node_termination_reasons:
+  - ""
   # PodPendingTimeout defines how long the controller will wait to perform a garbage
   # collection on pending pods. Defaults to 10 minutes.
   pod_pending_timeout: 0s
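As a usage example, an operator can also override both settings explicitly. The snippet below is a sketch only; the last reason string is hypothetical. Note that because the defaults are applied only when node_termination_reasons is left unset (see the nil check in parseProwConfig above), an explicit list replaces the built-in reasons rather than extending them:

plank:
  # A value of 0 means no retries: a terminated pod errors the job immediately.
  max_retries: 0
  # Explicit list replaces the built-in defaults, so repeat them if still wanted.
  node_termination_reasons:
  - DeletionByPodGC
  - DeletionByGCPControllerManager
  - DeletionByMyCustomNodeDrainer   # hypothetical, cluster-specific reason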

‎pkg/plank/controller_test.go

+11 -4 lines

@@ -65,6 +65,11 @@ const (
 	podDeletionPreventionFinalizer = "keep-from-vanishing"
 )
 
+var (
+	maxRetries             = 3
+	nodeTerminationReasons = []string{"DeletionByPodGC", "DeletionByGCPControllerManager"}
+)
+
 func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[string]int) *fca {
 	presubmits := []config.Presubmit{
 		{

@@ -102,10 +107,12 @@ func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[st
 				MaxConcurrency: maxConcurrency,
 				MaxGoroutines:  20,
 			},
-			JobQueueCapacities:    queueCapacities,
-			PodPendingTimeout:     &metav1.Duration{Duration: podPendingTimeout},
-			PodRunningTimeout:     &metav1.Duration{Duration: podRunningTimeout},
-			PodUnscheduledTimeout: &metav1.Duration{Duration: podUnscheduledTimeout},
+			JobQueueCapacities:     queueCapacities,
+			PodPendingTimeout:      &metav1.Duration{Duration: podPendingTimeout},
+			PodRunningTimeout:      &metav1.Duration{Duration: podRunningTimeout},
+			PodUnscheduledTimeout:  &metav1.Duration{Duration: podUnscheduledTimeout},
+			MaxRetries:             &maxRetries,
+			NodeTerminationReasons: nodeTerminationReasons,
 		},
 	},
 	JobConfig: config.JobConfig{

‎pkg/plank/reconciler.go

+6 -17 lines

@@ -21,6 +21,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"slices"
 	"strings"
 	"sync"
 	"time"

@@ -60,8 +61,6 @@ import (
 
 const ControllerName = "plank"
 
-const MaxPodRetries = 3
-
 // PodStatus constants
 const (
 	Evicted = "Evicted"

@@ -464,7 +463,7 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 			pj.Status.PodName = pn
 			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
 		}
-	} else if transientFailure := getTransientFailure(pod); transientFailure != PodTransientFailureNone {
+	} else if transientFailure := getTransientFailure(pod, r.config().Plank.NodeTerminationReasons); transientFailure != PodTransientFailureNone {
 		switch {
 		case transientFailure == PodTransientFailureEvicted && pj.Spec.ErrorOnEviction:
 			// ErrorOnEviction is enabled, complete the PJ and mark it as errored.

@@ -478,12 +477,12 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
 			pj.Status.Description = "Job pod's node was terminated."
-		case pj.Status.RetryCount >= MaxPodRetries:
+		case pj.Status.RetryCount >= *r.config().Plank.MaxRetries:
 			// MaxPodRetries is reached, complete the PJ and mark it as errored.
 			r.log.WithField("transient-failure", transientFailure).WithFields(pjutil.ProwJobFields(pj)).Info("Pod Node reached max retries, fail job.")
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
-			pj.Status.Description = fmt.Sprintf("Job pod reached max retries (%d) for transient failure %s", MaxPodRetries, transientFailure)
+			pj.Status.Description = fmt.Sprintf("Job pod reached max retries (%d) for transient failure %s", pj.Status.RetryCount, transientFailure)
 		default:
 			// Update the retry count and delete the pod so it gets recreated in the next resync.
 			pj.Status.RetryCount++

@@ -666,7 +665,7 @@ const (
 	PodTransientFailureUnreachable PodTransientFailure = "unreachable"
 )
 
-func getTransientFailure(pod *corev1.Pod) PodTransientFailure {
+func getTransientFailure(pod *corev1.Pod, nodeTerminationReasons []string) PodTransientFailure {
 	if pod.Status.Reason == Evicted {
 		return PodTransientFailureEvicted
 	}

@@ -679,17 +678,7 @@ func getTransientFailure(pod *corev1.Pod) PodTransientFailure {
 	}
 
 	for _, condition := range pod.Status.Conditions {
-		// If the node does no longer exist and the pod gets garbage collected,
-		// this condition will be set:
-		// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
-		if condition.Reason == "DeletionByPodGC" {
-			return PodTransientFailureTerminated
-		}
-
-		// On GCP, before a new spot instance is started, the old pods are garbage
-		// collected (if they have not been already by the Kubernetes PodGC):
-		// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
-		if condition.Reason == "DeletionByGCPControllerManager" {
+		if slices.Contains(nodeTerminationReasons, condition.Reason) {
 			return PodTransientFailureTerminated
 		}
 	}
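With the hard-coded reasons gone, the classification reduces to a membership check against the configured list, read from r.config().Plank.NodeTerminationReasons on each sync. The sketch below is a self-contained illustration of that check; classifyNodeTermination is an illustrative name rather than the real getTransientFailure, and it returns a bool instead of the PodTransientFailure enum:

package main

import (
    "fmt"
    "slices"

    corev1 "k8s.io/api/core/v1"
)

// classifyNodeTermination mirrors the loop above: a pod counts as terminated
// (and therefore retriable) when any of its conditions carries one of the
// configured node-termination reasons.
func classifyNodeTermination(pod *corev1.Pod, nodeTerminationReasons []string) bool {
    for _, condition := range pod.Status.Conditions {
        if slices.Contains(nodeTerminationReasons, condition.Reason) {
            return true
        }
    }
    return false
}

func main() {
    // Condition shape as set by the Kubernetes PodGC when the node disappears;
    // only the Reason field is consulted by the check above.
    pod := &corev1.Pod{
        Status: corev1.PodStatus{
            Conditions: []corev1.PodCondition{
                {Type: "DisruptionTarget", Reason: "DeletionByPodGC"},
            },
        },
    }

    // Same values as the defaulted c.Plank.NodeTerminationReasons.
    reasons := []string{"DeletionByPodGC", "DeletionByGCPControllerManager"}
    fmt.Println(classifyNodeTermination(pod, reasons)) // true -> plank retries the job
}

Because the list is plain configuration, operators can recognize additional termination reasons (or drop the GCP-specific one) without a code change.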
