
Commit e416d6f

unify retry logic and add RetryCount and MaxPodRetries
Signed-off-by: Tim Ramlot <[email protected]>
1 parent ca08a58 commit e416d6f

2 files changed: +59 −80 lines changed

pkg/apis/prowjobs/v1/types.go

+2

@@ -1053,6 +1053,8 @@ type ProwJobStatus struct {
 	PendingTime *metav1.Time `json:"pendingTime,omitempty"`
 	// CompletionTime is the timestamp for when the job goes to a final state
 	CompletionTime *metav1.Time `json:"completionTime,omitempty"`
+	// Amount of times the Pod was retried.
+	RetryCount int `json:"retryCount,omitempty"`
 	// +kubebuilder:validation:Enum=scheduling;triggered;pending;success;failure;aborted;error
 	// +kubebuilder:validation:Required
 	State ProwJobState `json:"state,omitempty"`
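
Because the new field uses `omitempty`, a ProwJob whose pod never hit a transient failure serializes exactly as before; the counter only appears in the status once the first retry bumps it. A minimal, self-contained sketch of that behaviour, using a hypothetical stand-in struct rather than the real ProwJobStatus type:

package main

import (
	"encoding/json"
	"fmt"
)

// status is a hypothetical stand-in for the relevant slice of ProwJobStatus;
// only the json tag matters here, and it matches the one added in this commit.
type status struct {
	RetryCount int `json:"retryCount,omitempty"`
}

func main() {
	before, _ := json.Marshal(status{})             // zero value: field is omitted
	after, _ := json.Marshal(status{RetryCount: 1}) // after the first retry
	fmt.Println(string(before)) // {}
	fmt.Println(string(after))  // {"retryCount":1}
}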

pkg/plank/reconciler.go

+57 −80

@@ -60,6 +60,8 @@ import (
 
 const ControllerName = "plank"
 
+const MaxPodRetries = 3
+
 // PodStatus constants
 const (
 	Evicted = "Evicted"
@@ -462,104 +464,57 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 			pj.Status.PodName = pn
 			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
 		}
-	} else if pod.Status.Reason == Evicted {
-		// Pod was evicted.
-		if pj.Spec.ErrorOnEviction {
-			// ErrorOnEviction is enabled, complete the PJ and mark it as
-			// errored.
+	} else if transientFailure := getTransientFailure(pod); transientFailure != PodTransientFailureNone {
+		switch {
+		case transientFailure == PodTransientFailureEvicted && pj.Spec.ErrorOnEviction:
+			// ErrorOnEviction is enabled, complete the PJ and mark it as errored.
 			r.log.WithField("error-on-eviction", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, fail job.")
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
 			pj.Status.Description = "Job pod was evicted by the cluster."
-		} else {
-			// ErrorOnEviction is disabled. Delete the pod now and recreate it in
-			// the next resync.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, deleting & next sync loop will restart pod")
-			client, ok := r.buildClients[pj.ClusterAlias()]
-			if !ok {
-				return nil, TerminalError(fmt.Errorf("evicted pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
-			}
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-				oldPod := pod.DeepCopy()
-				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-				}
-			}
-			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
-			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-		}
-	} else if isPodTerminated(pod) {
-		// Pod was terminated.
-		if pj.Spec.ErrorOnTermination {
-			// ErrorOnTermination is enabled, complete the PJ and mark it as
-			// errored.
+		case transientFailure == PodTransientFailureTerminated && pj.Spec.ErrorOnTermination:
+			// ErrorOnTermination is enabled, complete the PJ and mark it as errored.
 			r.log.WithField("error-on-termination", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got terminated, fail job.")
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
 			pj.Status.Description = "Job pod's node was terminated."
-		} else {
-			// ErrorOnTermination is disabled. Delete the pod now and recreate it in
-			// the next resync.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got terminated, deleting & next sync loop will restart pod")
+		case pj.Status.RetryCount >= MaxPodRetries:
+			// MaxPodRetries is reached, complete the PJ and mark it as errored.
+			r.log.WithField("transient-failure", transientFailure).WithFields(pjutil.ProwJobFields(pj)).Info("Pod Node reached max retries, fail job.")
+			pj.SetComplete()
+			pj.Status.State = prowv1.ErrorState
+			pj.Status.Description = fmt.Sprintf("Job pod reached max retries (%d) for transient failure %s", MaxPodRetries, transientFailure)
+		default:
+			// Update the retry count and delete the pod so it gets recreated in the next resync.
+			pj.Status.RetryCount++
+			r.log.
+				WithField("transientFailure", transientFailure).
+				WithFields(pjutil.ProwJobFields(pj)).
+				Info("Pod has transient failure, deleting & next sync loop will restart pod")
+
 			client, ok := r.buildClients[pj.ClusterAlias()]
 			if !ok {
-				return nil, TerminalError(fmt.Errorf("terminated pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+				return nil, TerminalError(fmt.Errorf("pod %s with transient failure %s: unknown cluster alias %q", pod.Name, transientFailure, pj.ClusterAlias()))
 			}
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
+			if finalizers := sets.New(pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
 				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
 				oldPod := pod.DeepCopy()
 				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
 				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
 					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
 				}
 			}
-			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
-			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-		}
-	} else if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason {
-		// This can happen in any phase and means the node got evicted after it became unresponsive. Delete the finalizer so the pod
-		// vanishes and we will silently re-create it in the next iteration.
-		r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got lost, deleting & next sync loop will restart pod")
-		client, ok := r.buildClients[pj.ClusterAlias()]
-		if !ok {
-			return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
-		}
-
-		if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-			// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-			oldPod := pod.DeepCopy()
-			pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-			if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-				return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-			}
-		}
 
-		return nil, nil
-	} else {
-		switch pod.Status.Phase {
-		case corev1.PodUnknown:
-			// Pod is in Unknown state. This can happen if there is a problem with
-			// the node. Delete the old pod, this will fire an event that triggers
-			// a new reconciliation in which we will re-create the pod.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is in unknown state, deleting & restarting pod")
-			client, ok := r.buildClients[pj.ClusterAlias()]
-			if !ok {
-				return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+			// Pod is already deleted, so we don't need to delete it again.
+			if pod.DeletionTimestamp != nil {
+				return nil, nil
 			}
 
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-				oldPod := pod.DeepCopy()
-				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-				}
-			}
 			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
 			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-
+		}
+	} else {
+		switch pod.Status.Phase {
 		case corev1.PodSucceeded:
 			pj.SetComplete()
 			// There were bugs around this in the past so be paranoid and verify each container
@@ -701,31 +656,53 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 	return nil, nil
 }
 
-func isPodTerminated(pod *corev1.Pod) bool {
+type PodTransientFailure string
+
+const (
+	PodTransientFailureNone        PodTransientFailure = ""
+	PodTransientFailureUnknown     PodTransientFailure = "unknown"
+	PodTransientFailureEvicted     PodTransientFailure = "evicted"
+	PodTransientFailureTerminated  PodTransientFailure = "terminated"
+	PodTransientFailureUnreachable PodTransientFailure = "unreachable"
+)
+
+func getTransientFailure(pod *corev1.Pod) PodTransientFailure {
+	if pod.Status.Reason == Evicted {
+		return PodTransientFailureEvicted
+	}
+
 	// If there was a Graceful node shutdown, the Pod's status will have a
 	// reason set to "Terminated":
 	// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
 	if pod.Status.Reason == Terminated {
-		return true
+		return PodTransientFailureTerminated
 	}
 
 	for _, condition := range pod.Status.Conditions {
 		// If the node does no longer exist and the pod gets garbage collected,
 		// this condition will be set:
 		// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
 		if condition.Reason == "DeletionByPodGC" {
-			return true
+			return PodTransientFailureTerminated
 		}
 
 		// On GCP, before a new spot instance is started, the old pods are garbage
 		// collected (if they have not been already by the Kubernetes PodGC):
 		// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
 		if condition.Reason == "DeletionByGCPControllerManager" {
-			return true
+			return PodTransientFailureTerminated
 		}
 	}
 
-	return false
+	if pod.Status.Reason == NodeUnreachablePodReason && pod.DeletionTimestamp != nil {
+		return PodTransientFailureUnreachable
+	}
+
+	if pod.Status.Phase == corev1.PodUnknown {
+		return PodTransientFailureUnknown
+	}
+
+	return PodTransientFailureNone
 }
 
 // syncTriggeredJob syncs jobs that do not yet have an associated test workload running