@@ -60,6 +60,8 @@ import (
 
 const ControllerName = "plank"
 
+const MaxPodRetries = 3
+
 // PodStatus constants
 const (
 	Evicted = "Evicted"
@@ -462,104 +464,57 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 			pj.Status.PodName = pn
 			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
 		}
-	} else if pod.Status.Reason == Evicted {
-		// Pod was evicted.
-		if pj.Spec.ErrorOnEviction {
-			// ErrorOnEviction is enabled, complete the PJ and mark it as
-			// errored.
+	} else if transientFailure := getTransientFailure(pod); transientFailure != PodTransientFailureNone {
+		switch {
+		case transientFailure == PodTransientFailureEvicted && pj.Spec.ErrorOnEviction:
+			// ErrorOnEviction is enabled, complete the PJ and mark it as errored.
 			r.log.WithField("error-on-eviction", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, fail job.")
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
 			pj.Status.Description = "Job pod was evicted by the cluster."
-		} else {
-			// ErrorOnEviction is disabled. Delete the pod now and recreate it in
-			// the next resync.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, deleting & next sync loop will restart pod")
-			client, ok := r.buildClients[pj.ClusterAlias()]
-			if !ok {
-				return nil, TerminalError(fmt.Errorf("evicted pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
-			}
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-				oldPod := pod.DeepCopy()
-				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-				}
-			}
-			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
-			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-		}
-	} else if isPodTerminated(pod) {
-		// Pod was terminated.
-		if pj.Spec.ErrorOnTermination {
-			// ErrorOnTermination is enabled, complete the PJ and mark it as
-			// errored.
+		case transientFailure == PodTransientFailureTerminated && pj.Spec.ErrorOnTermination:
+			// ErrorOnTermination is enabled, complete the PJ and mark it as errored.
 			r.log.WithField("error-on-termination", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got terminated, fail job.")
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
 			pj.Status.Description = "Job pod's node was terminated."
-		} else {
-			// ErrorOnTermination is disabled. Delete the pod now and recreate it in
-			// the next resync.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got terminated, deleting & next sync loop will restart pod")
+		case pj.Status.RetryCount >= MaxPodRetries:
+			// MaxPodRetries is reached, complete the PJ and mark it as errored.
+			r.log.WithField("transient-failure", transientFailure).WithFields(pjutil.ProwJobFields(pj)).Info("Pod Node reached max retries, fail job.")
+			pj.SetComplete()
+			pj.Status.State = prowv1.ErrorState
+			pj.Status.Description = fmt.Sprintf("Job pod reached max retries (%d) for transient failure %s", MaxPodRetries, transientFailure)
+		default:
+			// Update the retry count and delete the pod so it gets recreated in the next resync.
+			pj.Status.RetryCount++
+			r.log.
+				WithField("transientFailure", transientFailure).
+				WithFields(pjutil.ProwJobFields(pj)).
+				Info("Pod has transient failure, deleting & next sync loop will restart pod")
+
 			client, ok := r.buildClients[pj.ClusterAlias()]
 			if !ok {
-				return nil, TerminalError(fmt.Errorf("terminated pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+				return nil, TerminalError(fmt.Errorf("pod %s with transient failure %s : unknown cluster alias %q", pod.Name, transientFailure, pj.ClusterAlias()))
 			}
-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
+			if finalizers := sets.New(pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
 				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
 				oldPod := pod.DeepCopy()
 				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
 				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
 					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
 				}
 			}
-			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
-			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-		}
-	} else if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason {
-		// This can happen in any phase and means the node got evicted after it became unresponsive. Delete the finalizer so the pod
-		// vanishes and we will silently re-create it in the next iteration.
-		r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got lost, deleting & next sync loop will restart pod")
-		client, ok := r.buildClients[pj.ClusterAlias()]
-		if !ok {
-			return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
-		}
-
-		if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-			// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-			oldPod := pod.DeepCopy()
-			pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-			if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-				return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-			}
-		}

-		return nil, nil
-	} else {
-		switch pod.Status.Phase {
-		case corev1.PodUnknown:
-			// Pod is in Unknown state. This can happen if there is a problem with
-			// the node. Delete the old pod, this will fire an event that triggers
-			// a new reconciliation in which we will re-create the pod.
-			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is in unknown state, deleting & restarting pod")
-			client, ok := r.buildClients[pj.ClusterAlias()]
-			if !ok {
-				return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
+			// Pod is already deleted, so we don't need to delete it again.
+			if pod.DeletionTimestamp != nil {
+				return nil, nil
 			}

-			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
-				// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
-				oldPod := pod.DeepCopy()
-				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
-				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
-					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
-				}
-			}
 			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
 			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
-
+		}
+	} else {
+		switch pod.Status.Phase {
 		case corev1.PodSucceeded:
 			pj.SetComplete()
 			// There were bugs around this in the past so be paranoid and verify each container
@@ -701,31 +656,53 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 	return nil, nil
 }
 
-func isPodTerminated(pod *corev1.Pod) bool {
+type PodTransientFailure string
+
+const (
+	PodTransientFailureNone        PodTransientFailure = ""
+	PodTransientFailureUnknown     PodTransientFailure = "unknown"
+	PodTransientFailureEvicted     PodTransientFailure = "evicted"
+	PodTransientFailureTerminated  PodTransientFailure = "terminated"
+	PodTransientFailureUnreachable PodTransientFailure = "unreachable"
+)
+
+func getTransientFailure(pod *corev1.Pod) PodTransientFailure {
+	if pod.Status.Reason == Evicted {
+		return PodTransientFailureEvicted
+	}
+
 	// If there was a Graceful node shutdown, the Pod's status will have a
 	// reason set to "Terminated":
 	// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
 	if pod.Status.Reason == Terminated {
-		return true
+		return PodTransientFailureTerminated
 	}
 
 	for _, condition := range pod.Status.Conditions {
 		// If the node does no longer exist and the pod gets garbage collected,
 		// this condition will be set:
 		// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
 		if condition.Reason == "DeletionByPodGC" {
-			return true
+			return PodTransientFailureTerminated
 		}
 
 		// On GCP, before a new spot instance is started, the old pods are garbage
 		// collected (if they have not been already by the Kubernetes PodGC):
 		// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
 		if condition.Reason == "DeletionByGCPControllerManager" {
-			return true
+			return PodTransientFailureTerminated
 		}
 	}
 
-	return false
+	if pod.Status.Reason == NodeUnreachablePodReason && pod.DeletionTimestamp != nil {
+		return PodTransientFailureUnreachable
+	}
+
+	if pod.Status.Phase == corev1.PodUnknown {
+		return PodTransientFailureUnknown
+	}
+
+	return PodTransientFailureNone
 }
 
 // syncTriggeredJob syncs jobs that do not yet have an associated test workload running
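Below is a minimal, standalone sketch of the pod classification this change introduces; it is not part of the diff. It re-declares a trimmed-down, local stand-in for getTransientFailure (the pod-condition checks for DeletionByPodGC and DeletionByGCPControllerManager are omitted), and the pod fixtures are illustrative assumptions rather than cases taken from this PR.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// classify is a trimmed-down, local stand-in for plank's getTransientFailure.
// It only inspects pod.Status.Reason, pod.Status.Phase and the deletion
// timestamp; the pod-condition checks from the real helper are left out.
func classify(pod *corev1.Pod) string {
	switch {
	case pod.Status.Reason == "Evicted":
		return "evicted"
	case pod.Status.Reason == "Terminated":
		return "terminated"
	// "NodeLost" is the value behind NodeUnreachablePodReason upstream.
	case pod.Status.Reason == "NodeLost" && pod.DeletionTimestamp != nil:
		return "unreachable"
	case pod.Status.Phase == corev1.PodUnknown:
		return "unknown"
	default:
		return ""
	}
}

func main() {
	now := metav1.Now()
	// Illustrative fixtures, not taken from the PR.
	pods := map[string]*corev1.Pod{
		"evicted pod":     {Status: corev1.PodStatus{Reason: "Evicted"}},
		"unreachable pod": {ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &now}, Status: corev1.PodStatus{Reason: "NodeLost"}},
		"healthy pod":     {Status: corev1.PodStatus{Phase: corev1.PodRunning}},
	}
	for name, pod := range pods {
		fmt.Printf("%s -> transient failure %q\n", name, classify(pod))
	}
}

In the controller itself, the same classification feeds the new switch in syncPendingJob: eviction and termination can still fail the job immediately when ErrorOnEviction or ErrorOnTermination is set, and any other transient failure deletes the pod for recreation at most MaxPodRetries times before the ProwJob is marked as errored.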