Skip to content

Commit da96973

Browse files
committed
better detect pod failure due to node termination
Signed-off-by: Tim Ramlot <[email protected]>
1 parent d3c2b8a commit da96973

File tree

1 file changed

+28
-1
lines changed

1 file changed

+28
-1
lines changed

pkg/plank/reconciler.go

+28-1
Original file line number | Diff line number | Diff line change
@@ -481,7 +481,7 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
481481
r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
482482
return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
483483
}
484-
} else if pod.Status.Reason == Terminated {
484+
} else if isPodTerminated(pod) {
485485
// Pod was terminated.
486486
if pj.Spec.ErrorOnTermination {
487487
// ErrorOnTermination is enabled, complete the PJ and mark it as
@@ -692,6 +692,33 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
692692
return nil, nil
693693
}
694694

695+
func isPodTerminated(pod *corev1.Pod) bool {
696+
// If there was a Graceful node shutdown, the Pod's status will have a
697+
// reason set to "Terminated":
698+
// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
699+
if pod.Status.Reason == Terminated {
700+
return true
701+
}
702+
703+
for _, condition := range pod.Status.Conditions {
704+
// If the node does no longer exist and the pod gets garbage collected,
705+
// this condition will be set:
706+
// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
707+
if condition.Reason == "DeletionByPodGC" {
708+
return true
709+
}
710+
711+
// On GCP, before a new spot instance is started, the old pods are garbage
712+
// collected (if they have not been already by the Kubernetes PodGC):
713+
// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
714+
if condition.Reason == "DeletionByGCPControllerManager" {
715+
return true
716+
}
717+
}
718+
719+
return false
720+
}
721+
695722
// syncTriggeredJob syncs jobs that do not yet have an associated test workload running
696723
func (r *reconciler) syncTriggeredJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
697724
prevPJ := pj.DeepCopy()

0 commit comments

Comments
 (0)