Skip to content

Commit cb50220

Browse files
committed
add option to restart prowjob when node is terminated (enabled by default)
Signed-off-by: Tim Ramlot <[email protected]>
1 parent 0a35181 commit cb50220

File tree

8 files changed

+158
-7
lines changed

8 files changed

+158
-7
lines changed

config/prow/cluster/prowjob-crd/prowjob_customresourcedefinition.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,12 @@ spec:
484484
the job is evicted. If this field is unspecified or false, a new
485485
pod will be created to replace the evicted one.
486486
type: boolean
487+
error_on_termination:
488+
description: ErrorOnTermination indicates that the ProwJob should
489+
be completed and given the ErrorState status if the pod that is
490+
executing the job is terminated. If this field is unspecified or
491+
false, a new pod will be created to replace the terminated one.
492+
type: boolean
487493
extra_refs:
488494
description: ExtraRefs are auxiliary repositories that need to be
489495
cloned, determined from config

pkg/apis/prowjobs/v1/types.go

+5
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,11 @@ type ProwJobSpec struct {
182182
// If this field is unspecified or false, a new pod will be created to replace
183183
// the evicted one.
184184
ErrorOnEviction bool `json:"error_on_eviction,omitempty"`
185+
// ErrorOnTermination indicates that the ProwJob should be completed and given
186+
// the ErrorState status if the pod that is executing the job is terminated.
187+
// If this field is unspecified or false, a new pod will be created to replace
188+
// the terminated one.
189+
ErrorOnTermination bool `json:"error_on_termination,omitempty"`
185190

186191
// PodSpec provides the basis for running the test under
187192
// a Kubernetes agent

pkg/config/config.go

+2
Original file line numberDiff line numberDiff line change
@@ -2894,6 +2894,8 @@ func validateAgent(v JobBase, podNamespace string) error {
28942894
return fmt.Errorf("decoration requires agent: %s (found %q)", k, agent)
28952895
case v.ErrorOnEviction && agent != k:
28962896
return fmt.Errorf("error_on_eviction only applies to agent: %s (found %q)", k, agent)
2897+
case v.ErrorOnTermination && agent != k:
2898+
return fmt.Errorf("error_on_termination only applies to agent: %s (found %q)", k, agent)
28972899
case v.Namespace == nil || *v.Namespace == "":
28982900
return fmt.Errorf("failed to default namespace")
28992901
case *v.Namespace != podNamespace && agent != p:

pkg/config/config_test.go

+7
Original file line numberDiff line numberDiff line change
@@ -1428,6 +1428,13 @@ func TestValidateAgent(t *testing.T) {
14281428
},
14291429
pass: true,
14301430
},
1431+
{
1432+
name: "error_on_termination allowed for kubernetes agent",
1433+
base: func(j *JobBase) {
1434+
j.ErrorOnTermination = true
1435+
},
1436+
pass: true,
1437+
},
14311438
}
14321439

14331440
for _, tc := range cases {

pkg/config/jobs.go

+5
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,11 @@ type JobBase struct {
109109
// If this field is unspecified or false, a new pod will be created to replace
110110
// the evicted one.
111111
ErrorOnEviction bool `json:"error_on_eviction,omitempty"`
112+
// ErrorOnTermination indicates that the ProwJob should be completed and given
113+
// the ErrorState status if the pod that is executing the job is terminated.
114+
// If this field is unspecified or false, a new pod will be created to replace
115+
// the terminated one.
116+
ErrorOnTermination bool `json:"error_on_termination,omitempty"`
112117
// SourcePath contains the path where this job is defined
113118
SourcePath string `json:"-"`
114119
// Spec is the Kubernetes pod spec used if Agent is kubernetes.

pkg/pjutil/pjutil.go

+7-6
Original file line numberDiff line numberDiff line change
@@ -228,12 +228,13 @@ func specFromJobBase(jb config.JobBase) prowapi.ProwJobSpec {
228228
namespace = *jb.Namespace
229229
}
230230
return prowapi.ProwJobSpec{
231-
Job: jb.Name,
232-
Agent: prowapi.ProwJobAgent(jb.Agent),
233-
Cluster: jb.Cluster,
234-
Namespace: namespace,
235-
MaxConcurrency: jb.MaxConcurrency,
236-
ErrorOnEviction: jb.ErrorOnEviction,
231+
Job: jb.Name,
232+
Agent: prowapi.ProwJobAgent(jb.Agent),
233+
Cluster: jb.Cluster,
234+
Namespace: namespace,
235+
MaxConcurrency: jb.MaxConcurrency,
236+
ErrorOnEviction: jb.ErrorOnEviction,
237+
ErrorOnTermination: jb.ErrorOnTermination,
237238

238239
ExtraRefs: DecorateExtraRefs(jb.ExtraRefs, jb),
239240
DecorationConfig: jb.DecorationConfig,

pkg/plank/controller_test.go

+96
Original file line numberDiff line numberDiff line change
@@ -1167,6 +1167,102 @@ func TestSyncPendingJob(t *testing.T) {
11671167
ExpectedNumPods: 1,
11681168
ExpectedURL: "boop-42/error",
11691169
},
1170+
{
1171+
Name: "delete terminated pod",
1172+
PJ: prowapi.ProwJob{
1173+
ObjectMeta: metav1.ObjectMeta{
1174+
Name: "boop-42",
1175+
Namespace: "prowjobs",
1176+
},
1177+
Spec: prowapi.ProwJobSpec{
1178+
PodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
1179+
},
1180+
Status: prowapi.ProwJobStatus{
1181+
State: prowapi.PendingState,
1182+
PodName: "boop-42",
1183+
},
1184+
},
1185+
Pods: []v1.Pod{
1186+
{
1187+
ObjectMeta: metav1.ObjectMeta{
1188+
Name: "boop-42",
1189+
Namespace: "pods",
1190+
},
1191+
Status: v1.PodStatus{
1192+
Phase: v1.PodFailed,
1193+
Reason: Terminated,
1194+
},
1195+
},
1196+
},
1197+
ExpectedComplete: false,
1198+
ExpectedState: prowapi.PendingState,
1199+
ExpectedNumPods: 0,
1200+
},
1201+
{
1202+
Name: "delete terminated pod and remove its k8sreporter finalizer",
1203+
PJ: prowapi.ProwJob{
1204+
ObjectMeta: metav1.ObjectMeta{
1205+
Name: "boop-42",
1206+
Namespace: "prowjobs",
1207+
},
1208+
Spec: prowapi.ProwJobSpec{
1209+
PodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
1210+
},
1211+
Status: prowapi.ProwJobStatus{
1212+
State: prowapi.PendingState,
1213+
PodName: "boop-42",
1214+
},
1215+
},
1216+
Pods: []v1.Pod{
1217+
{
1218+
ObjectMeta: metav1.ObjectMeta{
1219+
Name: "boop-42",
1220+
Namespace: "pods",
1221+
Finalizers: []string{"prow.x-k8s.io/gcsk8sreporter"},
1222+
},
1223+
Status: v1.PodStatus{
1224+
Phase: v1.PodFailed,
1225+
Reason: Terminated,
1226+
},
1227+
},
1228+
},
1229+
ExpectedComplete: false,
1230+
ExpectedState: prowapi.PendingState,
1231+
ExpectedNumPods: 0,
1232+
},
1233+
{
1234+
Name: "don't delete terminated pod w/ error_on_termination, complete PJ instead",
1235+
PJ: prowapi.ProwJob{
1236+
ObjectMeta: metav1.ObjectMeta{
1237+
Name: "boop-42",
1238+
Namespace: "prowjobs",
1239+
},
1240+
Spec: prowapi.ProwJobSpec{
1241+
ErrorOnTermination: true,
1242+
PodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
1243+
},
1244+
Status: prowapi.ProwJobStatus{
1245+
State: prowapi.PendingState,
1246+
PodName: "boop-42",
1247+
},
1248+
},
1249+
Pods: []v1.Pod{
1250+
{
1251+
ObjectMeta: metav1.ObjectMeta{
1252+
Name: "boop-42",
1253+
Namespace: "pods",
1254+
},
1255+
Status: v1.PodStatus{
1256+
Phase: v1.PodFailed,
1257+
Reason: Terminated,
1258+
},
1259+
},
1260+
},
1261+
ExpectedComplete: true,
1262+
ExpectedState: prowapi.ErrorState,
1263+
ExpectedNumPods: 1,
1264+
ExpectedURL: "boop-42/error",
1265+
},
11701266
{
11711267
Name: "running pod",
11721268
PJ: prowapi.ProwJob{

pkg/plank/reconciler.go

+30-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ const ControllerName = "plank"
6161

6262
// PodStatus constants
6363
const (
64-
Evicted = "Evicted"
64+
Evicted = "Evicted"
65+
Terminated = "Terminated"
6566
)
6667

6768
// NodeStatus constants
@@ -480,6 +481,34 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
480481
r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
481482
return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
482483
}
484+
} else if pod.Status.Reason == Terminated {
485+
// Pod was terminated.
486+
if pj.Spec.ErrorOnTermination {
487+
// ErrorOnTermination is enabled, complete the PJ and mark it as
488+
// errored.
489+
r.log.WithField("error-on-termination", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got terminated, fail job.")
490+
pj.SetComplete()
491+
pj.Status.State = prowv1.ErrorState
492+
pj.Status.Description = "Job pod's node was terminated."
493+
} else {
494+
// ErrorOnTermination is disabled. Delete the pod now and recreate it in
495+
// the next resync.
496+
r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got terminated, deleting & next sync loop will restart pod")
497+
client, ok := r.buildClients[pj.ClusterAlias()]
498+
if !ok {
499+
return nil, TerminalError(fmt.Errorf("terminated pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
500+
}
501+
if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
502+
// We want the end user to not see this, so we have to remove the finalizer, otherwise the pod hangs
503+
oldPod := pod.DeepCopy()
504+
pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
505+
if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
506+
return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
507+
}
508+
}
509+
r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
510+
return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
511+
}
483512
} else if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason {
484513
// This can happen in any phase and means the node got evicted after it became unresponsive. Delete the finalizer so the pod
485514
// vanishes and we will silently re-create it in the next iteration.

0 commit comments

Comments
 (0)