Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit db2fd80

Browse files
committed Apr 18, 2024
add option to restart prowjob when node is terminated (enabled by default)
Signed-off-by: Tim Ramlot <42113979+inteon@users.noreply.github.com>
1 parent 0a35181 commit db2fd80

File tree

2 files changed

+35
-1
lines changed

2 files changed

+35
-1
lines changed
 

‎pkg/apis/prowjobs/v1/types.go

+5
Original file line number | Diff line number | Diff line change
@@ -182,6 +182,11 @@ type ProwJobSpec struct {
182182
// If this field is unspecified or false, a new pod will be created to replace
183183
// the evicted one.
184184
ErrorOnEviction bool `json:"error_on_eviction,omitempty"`
185+
// ErrorOnTermination indicates that the ProwJob should be completed and given
186+
// the ErrorState status if the pod that is executing the job is terminated.
187+
// If this field is unspecified or false, a new pod will be created to replace
188+
// the terminated one.
189+
ErrorOnTermination bool `json:"error_on_termination,omitempty"`
185190

186191
// PodSpec provides the basis for running the test under
187192
// a Kubernetes agent

‎pkg/plank/reconciler.go

+30-1
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,8 @@ const ControllerName = "plank"
6161

6262
// PodStatus constants
6363
const (
64-
Evicted = "Evicted"
64+
Evicted = "Evicted"
65+
Terminated = "Terminated"
6566
)
6667

6768
// NodeStatus constants
@@ -480,6 +481,34 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
480481
r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
481482
return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
482483
}
484+
} else if pod.Status.Reason == Terminated {
485+
// Pod was terminated.
486+
if pj.Spec.ErrorOnTermination {
487+
// ErrorOnTermination is enabled, complete the PJ and mark it as
488+
// errored.
489+
r.log.WithField("error-on-termination", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got terminated, fail job.")
490+
pj.SetComplete()
491+
pj.Status.State = prowv1.ErrorState
492+
pj.Status.Description = "Job pod's node was terminated."
493+
} else {
494+
// ErrorOnTermination is disabled. Delete the pod now and recreate it in
495+
// the next resync.
496+
r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got terminated, deleting & next sync loop will restart pod")
497+
client, ok := r.buildClients[pj.ClusterAlias()]
498+
if !ok {
499+
return nil, TerminalError(fmt.Errorf("terminated pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
500+
}
501+
if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
502+
// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
503+
oldPod := pod.DeepCopy()
504+
pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
505+
if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
506+
return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
507+
}
508+
}
509+
r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
510+
return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
511+
}
483512
} else if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason {
484513
// This can happen in any phase and means the node got evicted after it became unresponsive. Delete the finalizer so the pod
485514
// vanishes and we will silently re-create it in the next iteration.

0 commit comments

Comments
 (0)
Please sign in to comment.