@@ -20,14 +20,17 @@ import (
20
20
"context"
21
21
"fmt"
22
22
"reflect"
23
+ "sort"
23
24
"time"
24
25
25
26
"github.com/pkg/errors"
27
+ "golang.org/x/exp/slices"
26
28
corev1 "k8s.io/api/core/v1"
27
29
apierrors "k8s.io/apimachinery/pkg/api/errors"
28
30
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29
31
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
30
32
kerrors "k8s.io/apimachinery/pkg/util/errors"
33
+ "k8s.io/apimachinery/pkg/util/intstr"
31
34
"k8s.io/apimachinery/pkg/util/wait"
32
35
"k8s.io/klog/v2"
33
36
"k8s.io/utils/ptr"
@@ -44,6 +47,7 @@ import (
44
47
"sigs.k8s.io/cluster-api/internal/util/ssa"
45
48
"sigs.k8s.io/cluster-api/util"
46
49
"sigs.k8s.io/cluster-api/util/annotations"
50
+ "sigs.k8s.io/cluster-api/util/collections"
47
51
"sigs.k8s.io/cluster-api/util/conditions"
48
52
utilconversion "sigs.k8s.io/cluster-api/util/conversion"
49
53
"sigs.k8s.io/cluster-api/util/labels"
@@ -294,15 +298,18 @@ func (r *MachinePoolReconciler) reconcileInfrastructure(ctx context.Context, s *
294
298
// Get the nodeRefsMap from the cluster.
295
299
s .nodeRefMap , getNodeRefsErr = r .getNodeRefMap (ctx , clusterClient )
296
300
297
- err = r .reconcileMachines (ctx , s , infraConfig )
301
+ res := ctrl.Result {}
302
+
303
+ reconcileMachinesRes , err := r .reconcileMachines (ctx , s , infraConfig )
304
+ res = util .LowestNonZeroResult (res , reconcileMachinesRes )
298
305
299
306
if err != nil || getNodeRefsErr != nil {
300
307
return ctrl.Result {}, kerrors .NewAggregate ([]error {errors .Wrapf (err , "failed to reconcile Machines for MachinePool %s" , klog .KObj (mp )), errors .Wrapf (getNodeRefsErr , "failed to get nodeRefs for MachinePool %s" , klog .KObj (mp ))})
301
308
}
302
309
303
310
if ! mp .Status .InfrastructureReady {
304
311
log .Info ("Infrastructure provider is not yet ready" , infraConfig .GetKind (), klog .KObj (infraConfig ))
305
- return ctrl. Result {} , nil
312
+ return res , nil
306
313
}
307
314
308
315
var providerIDList []string
@@ -321,7 +328,7 @@ func (r *MachinePoolReconciler) reconcileInfrastructure(ctx context.Context, s *
321
328
322
329
if len (providerIDList ) == 0 && mp .Status .Replicas != 0 {
323
330
log .Info ("Retrieved empty spec.providerIDList from infrastructure provider but status.replicas is not zero." , "replicas" , mp .Status .Replicas )
324
- return ctrl. Result {} , nil
331
+ return res , nil
325
332
}
326
333
327
334
if ! reflect .DeepEqual (mp .Spec .ProviderIDList , providerIDList ) {
@@ -331,7 +338,7 @@ func (r *MachinePoolReconciler) reconcileInfrastructure(ctx context.Context, s *
331
338
mp .Status .UnavailableReplicas = mp .Status .Replicas
332
339
}
333
340
334
- return ctrl. Result {} , nil
341
+ return res , nil
335
342
}
336
343
337
344
// reconcileMachines reconciles Machines associated with a MachinePool.
@@ -341,18 +348,18 @@ func (r *MachinePoolReconciler) reconcileInfrastructure(ctx context.Context, s *
341
348
// infrastructure is created accordingly.
342
349
// Note: When supported by the cloud provider implementation of the MachinePool, machines will provide a means to interact
343
350
// with the corresponding infrastructure (e.g. delete a specific machine in case MachineHealthCheck detects it is unhealthy).
344
- func (r * MachinePoolReconciler ) reconcileMachines (ctx context.Context , s * scope , infraMachinePool * unstructured.Unstructured ) error {
351
+ func (r * MachinePoolReconciler ) reconcileMachines (ctx context.Context , s * scope , infraMachinePool * unstructured.Unstructured ) (ctrl. Result , error ) {
345
352
log := ctrl .LoggerFrom (ctx )
346
353
mp := s .machinePool
347
354
348
355
var infraMachineKind string
349
356
if err := util .UnstructuredUnmarshalField (infraMachinePool , & infraMachineKind , "status" , "infrastructureMachineKind" ); err != nil {
350
357
if errors .Is (err , util .ErrUnstructuredFieldNotFound ) {
351
358
log .V (4 ).Info ("MachinePool Machines not supported, no infraMachineKind found" )
352
- return nil
359
+ return ctrl. Result {}, nil
353
360
}
354
361
355
- return errors .Wrapf (err , "failed to retrieve infraMachineKind from infrastructure provider for MachinePool %s" , klog .KObj (mp ))
362
+ return ctrl. Result {}, errors .Wrapf (err , "failed to retrieve infraMachineKind from infrastructure provider for MachinePool %s" , klog .KObj (mp ))
356
363
}
357
364
358
365
infraMachineSelector := metav1.LabelSelector {
@@ -369,7 +376,7 @@ func (r *MachinePoolReconciler) reconcileMachines(ctx context.Context, s *scope,
369
376
infraMachineList .SetAPIVersion (infraMachinePool .GetAPIVersion ())
370
377
infraMachineList .SetKind (infraMachineKind + "List" )
371
378
if err := r .Client .List (ctx , & infraMachineList , client .InNamespace (mp .Namespace ), client .MatchingLabels (infraMachineSelector .MatchLabels )); err != nil {
372
- return errors .Wrapf (err , "failed to list infra machines for MachinePool %q in namespace %q" , mp .Name , mp .Namespace )
379
+ return ctrl. Result {}, errors .Wrapf (err , "failed to list infra machines for MachinePool %q in namespace %q" , mp .Name , mp .Namespace )
373
380
}
374
381
375
382
// Add watcher for infraMachine, if there isn't one already; this will allow this controller to reconcile
@@ -380,21 +387,26 @@ func (r *MachinePoolReconciler) reconcileMachines(ctx context.Context, s *scope,
380
387
381
388
// Add watcher for infraMachine, if there isn't one already.
382
389
if err := r .externalTracker .Watch (log , sampleInfraMachine , handler .EnqueueRequestsFromMapFunc (r .infraMachineToMachinePoolMapper )); err != nil {
383
- return err
390
+ return ctrl. Result {}, err
384
391
}
385
392
386
393
// Get the list of machines managed by this controller, and align it with the infra machines managed by
387
394
// the InfraMachinePool controller.
388
395
machineList := & clusterv1.MachineList {}
389
396
if err := r .Client .List (ctx , machineList , client .InNamespace (mp .Namespace ), client .MatchingLabels (infraMachineSelector .MatchLabels )); err != nil {
390
- return err
397
+ return ctrl. Result {}, err
391
398
}
392
399
393
400
if err := r .createOrUpdateMachines (ctx , s , machineList .Items , infraMachineList .Items ); err != nil {
394
- return errors .Wrapf (err , "failed to create machines for MachinePool %q in namespace %q" , mp .Name , mp .Namespace )
401
+ return ctrl. Result {}, errors .Wrapf (err , "failed to create machines for MachinePool %q in namespace %q" , mp .Name , mp .Namespace )
395
402
}
396
403
397
- return nil
404
+ res , err := r .reconcileUnhealthyMachines (ctx , s , machineList .Items )
405
+ if err != nil {
406
+ return ctrl.Result {}, errors .Wrapf (err , "failed to reconcile unhealthy machines for MachinePool %s" , klog .KObj (mp ))
407
+ }
408
+
409
+ return res , nil
398
410
}
399
411
400
412
// createOrUpdateMachines creates a MachinePool Machine for each infraMachine if it doesn't already exist and sets the owner reference and infraRef.
@@ -594,3 +606,121 @@ func (r *MachinePoolReconciler) getNodeRefMap(ctx context.Context, c client.Clie
594
606
595
607
return nodeRefsMap , nil
596
608
}
609
+
610
+ func (r * MachinePoolReconciler ) reconcileUnhealthyMachines (ctx context.Context , s * scope , machines []clusterv1.Machine ) (ctrl.Result , error ) {
611
+ if len (machines ) == 0 {
612
+ return ctrl.Result {}, nil
613
+ }
614
+
615
+ log := ctrl .LoggerFrom (ctx )
616
+ mp := s .machinePool
617
+
618
+ machinesWithHealthCheck := slices .DeleteFunc (slices .Clone (machines ), func (machine clusterv1.Machine ) bool {
619
+ return ! conditions .Has (& machine , clusterv1 .MachineHealthCheckSucceededCondition )
620
+ })
621
+ if len (machinesWithHealthCheck ) == 0 {
622
+ // This means there is no MachineHealthCheck selecting any machines
623
+ // of this machine pool. In this case, do not requeue so often,
624
+ // but still check regularly in case a MachineHealthCheck became
625
+ // deployed or activated. This long interval shouldn't be a problem
626
+ // at cluster creation, since newly-created nodes should anyway
627
+ // trigger MachinePool reconciliation as the infrastructure provider
628
+ // creates the InfraMachines.
629
+ log .V (4 ).Info ("Skipping reconciliation of unhealthy MachinePool machines because there are no health-checked machines" )
630
+ return ctrl.Result {RequeueAfter : 10 * time .Minute }, nil
631
+ }
632
+
633
+ unhealthyMachines := slices .DeleteFunc (slices .Clone (machines ), func (machine clusterv1.Machine ) bool {
634
+ return ! collections .IsUnhealthyAndOwnerRemediated (& machine )
635
+ })
636
+ log .V (4 ).Info ("Reconciling unhealthy MachinePool machines" , "unhealthyMachines" , len (unhealthyMachines ))
637
+
638
+ // Calculate how many in flight machines we should remediate.
639
+ // By default, we allow all machines to be remediated at the same time.
640
+ maxInFlight := len (unhealthyMachines )
641
+ if mp .Spec .Strategy != nil && mp .Spec .Strategy .Remediation != nil {
642
+ if mp .Spec .Strategy .Remediation .MaxInFlight != nil {
643
+ var err error
644
+ replicas := int (ptr .Deref (mp .Spec .Replicas , 1 ))
645
+ maxInFlight , err = intstr .GetScaledValueFromIntOrPercent (mp .Spec .Strategy .Remediation .MaxInFlight , replicas , true )
646
+ if err != nil {
647
+ return ctrl.Result {}, fmt .Errorf ("failed to calculate maxInFlight to remediate machines: %v" , err )
648
+ }
649
+ log = log .WithValues ("maxInFlight" , maxInFlight , "replicas" , replicas )
650
+ }
651
+ }
652
+
653
+ machinesToRemediate := make ([]* clusterv1.Machine , 0 , len (unhealthyMachines ))
654
+ inFlight := 0
655
+ for _ , m := range unhealthyMachines {
656
+ if ! m .DeletionTimestamp .IsZero () {
657
+ if conditions .IsTrue (& m , clusterv1 .MachineOwnerRemediatedCondition ) {
658
+ // Machine has been remediated by this controller and still in flight.
659
+ inFlight ++
660
+ }
661
+ continue
662
+ }
663
+ if conditions .IsFalse (& m , clusterv1 .MachineOwnerRemediatedCondition ) {
664
+ machinesToRemediate = append (machinesToRemediate , & m )
665
+ }
666
+ }
667
+ log = log .WithValues ("inFlight" , inFlight )
668
+
669
+ if len (machinesToRemediate ) == 0 {
670
+ // There's a MachineHealthCheck monitoring the machines, but currently
671
+ // no action to be taken. A machine could require remediation at any
672
+ // time, so use a short interval until next reconciliation.
673
+ return ctrl.Result {RequeueAfter : 30 * time .Second }, nil
674
+ }
675
+
676
+ if inFlight >= maxInFlight {
677
+ log .V (3 ).Info ("Remediation strategy is set, and maximum in flight has been reached" , "machinesToBeRemediated" , len (machinesToRemediate ))
678
+
679
+ // Check soon again whether the already-remediating (= deleting) machines are gone
680
+ // so that more machines can be remediated
681
+ return ctrl.Result {RequeueAfter : 15 * time .Second }, nil
682
+ }
683
+
684
+ // Sort the machines from newest to oldest.
685
+ // We are trying to remediate machines failing to come up first because
686
+ // there is a chance that they are not hosting any workloads (minimize disruption).
687
+ sort .SliceStable (machinesToRemediate , func (i , j int ) bool {
688
+ return machinesToRemediate [i ].CreationTimestamp .After (machinesToRemediate [j ].CreationTimestamp .Time )
689
+ })
690
+
691
+ haveMoreMachinesToRemediate := false
692
+ if len (machinesToRemediate ) > (maxInFlight - inFlight ) {
693
+ haveMoreMachinesToRemediate = true
694
+ log .V (5 ).Info ("Remediation strategy is set, limiting in flight operations" , "machinesToBeRemediated" , len (machinesToRemediate ))
695
+ machinesToRemediate = machinesToRemediate [:(maxInFlight - inFlight )]
696
+ }
697
+
698
+ // Remediate unhealthy machines by deleting them
699
+ var errs []error
700
+ for _ , m := range machinesToRemediate {
701
+ log .Info ("Deleting unhealthy Machine" , "Machine" , klog .KObj (m ))
702
+ patch := client .MergeFrom (m .DeepCopy ())
703
+ if err := r .Client .Delete (ctx , m ); err != nil {
704
+ if apierrors .IsNotFound (err ) {
705
+ continue
706
+ }
707
+ errs = append (errs , errors .Wrapf (err , "failed to delete Machine %s" , klog .KObj (m )))
708
+ continue
709
+ }
710
+ conditions .MarkTrue (m , clusterv1 .MachineOwnerRemediatedCondition )
711
+ if err := r .Client .Status ().Patch (ctx , m , patch ); err != nil && ! apierrors .IsNotFound (err ) {
712
+ errs = append (errs , errors .Wrapf (err , "failed to update status of Machine %s" , klog .KObj (m )))
713
+ }
714
+ }
715
+
716
+ if len (errs ) > 0 {
717
+ return ctrl.Result {}, errors .Wrapf (kerrors .NewAggregate (errs ), "failed to delete unhealthy Machines" )
718
+ }
719
+
720
+ if haveMoreMachinesToRemediate {
721
+ // More machines need remediation, so reconcile again sooner
722
+ return ctrl.Result {RequeueAfter : 15 * time .Second }, nil
723
+ }
724
+
725
+ return ctrl.Result {RequeueAfter : 30 * time .Second }, nil
726
+ }
0 commit comments