Skip to content

Commit e6fa01e

Browse files
authored
MGMT-10968: Replace agentmachine finalizer with machine pre-delete hook (openshift#54)
* Update assisted-service api and models to latest * Watch for updates to machines related to an agentmachine * Return machine from newAgentMachine test helper * Replace agentmachine finalizer with machine pre-delete hook Previously when a machine was deleted CAPI would cordon/drain the node, then wait on the finalizer for our agentmachine resource. This chain of events doesn't allow assisted-service time to run a pod on the node to reboot it into discovery. This commit adds a PreTerminateDeleteHook to the machine resource instead of a finalizer on the agent machine. When the agentmachine controller sees that the machine is waiting on the hook, it unbinds the agent and waits for the assisted-service to complete the reclaim work. Once the reclaim has completed or failed, the agentmachine controller removes the hook annotation. This requires specifically the PreTerminateDeleteHook rather than the PreDrain one because once we start the agent on the cluster we won't get a chance to drain the node before it is rebooted into discovery. https://issues.redhat.com/browse/MGMT-10968
1 parent 7266349 commit e6fa01e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+2236
-348
lines changed

controllers/agentmachine_controller.go

+141-50
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,12 @@ import (
2323
"fmt"
2424
"strconv"
2525
"strings"
26+
"time"
2627

2728
ignitionapi "github.com/coreos/ignition/v2/config/v3_1/types"
2829
"github.com/go-openapi/swag"
2930
aiv1beta1 "github.com/openshift/assisted-service/api/v1beta1"
31+
aimodels "github.com/openshift/assisted-service/models"
3032
capiproviderv1alpha1 "github.com/openshift/cluster-api-provider-agent/api/v1alpha1"
3133
openshiftconditionsv1 "github.com/openshift/custom-resource-status/conditions/v1"
3234
"github.com/sirupsen/logrus"
@@ -36,6 +38,7 @@ import (
3638
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3739
"k8s.io/apimachinery/pkg/labels"
3840
"k8s.io/apimachinery/pkg/runtime"
41+
"k8s.io/apimachinery/pkg/runtime/schema"
3942
"k8s.io/apimachinery/pkg/selection"
4043
"k8s.io/apimachinery/pkg/types"
4144
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
@@ -53,6 +56,8 @@ const (
5356
AgentMachineFinalizerName = "agentmachine." + aiv1beta1.Group + "/deprovision"
5457
AgentMachineRefLabelKey = "agentMachineRef"
5558
AgentMachineRefNamespace = "agentMachineRefNamespace"
59+
60+
machineDeleteHookName = clusterv1.PreTerminateDeleteHookAnnotationPrefix + "/agentmachine"
5661
)
5762

5863
// AgentMachineReconciler reconciles a AgentMachine object
@@ -89,16 +94,6 @@ func (r *AgentMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request
8994
return ctrl.Result{}, client.IgnoreNotFound(err)
9095
}
9196

92-
res, err := r.handleDeletionFinalizer(ctx, log, agentMachine)
93-
if res != nil || err != nil {
94-
return *res, err
95-
}
96-
97-
// If the AgentMachine is ready, we have nothing to do
98-
if agentMachine.Status.Ready {
99-
return ctrl.Result{}, nil
100-
}
101-
10297
machine, err := clusterutil.GetOwnerMachine(ctx, r.Client, agentMachine.ObjectMeta)
10398
if err != nil {
10499
log.WithError(err).Error("failed to get owner machine")
@@ -109,6 +104,16 @@ func (r *AgentMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request
109104
return r.updateStatus(ctx, log, agentMachine, ctrl.Result{}, nil)
110105
}
111106

107+
res, err := r.handleDeletionHook(ctx, log, agentMachine, machine)
108+
if res != nil || err != nil {
109+
return *res, err
110+
}
111+
112+
// If the AgentMachine is ready, we have nothing to do
113+
if agentMachine.Status.Ready {
114+
return ctrl.Result{}, nil
115+
}
116+
112117
agentCluster, err := r.getAgentCluster(ctx, log, machine)
113118
if err != nil {
114119
return ctrl.Result{}, err
@@ -141,51 +146,101 @@ func (r *AgentMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request
141146
return r.updateStatus(ctx, log, agentMachine, ctrl.Result{}, nil)
142147
}
143148

144-
func (r *AgentMachineReconciler) handleDeletionFinalizer(ctx context.Context, log logrus.FieldLogger, agentMachine *capiproviderv1alpha1.AgentMachine) (*ctrl.Result, error) {
145-
if agentMachine.ObjectMeta.DeletionTimestamp.IsZero() { // AgentMachine not being deleted
146-
// Register a finalizer if it is absent.
147-
if !funk.ContainsString(agentMachine.GetFinalizers(), AgentMachineFinalizerName) {
148-
controllerutil.AddFinalizer(agentMachine, AgentMachineFinalizerName)
149-
if err := r.Update(ctx, agentMachine); err != nil {
150-
log.WithError(err).Errorf("failed to add finalizer %s to resource %s %s", AgentMachineFinalizerName, agentMachine.Name, agentMachine.Namespace)
151-
return &ctrl.Result{}, err
152-
}
149+
func (r *AgentMachineReconciler) removeMachineDeletionHookAnnotation(ctx context.Context, machine *clusterv1.Machine) (err error) {
150+
annotations := machine.GetAnnotations()
151+
if _, haveMachineHookAnnotation := annotations[machineDeleteHookName]; haveMachineHookAnnotation {
152+
delete(annotations, machineDeleteHookName)
153+
machine.SetAnnotations(annotations)
154+
err = r.Update(ctx, machine)
155+
}
156+
return err
157+
}
158+
159+
func (r *AgentMachineReconciler) handleDeletionHook(ctx context.Context, log logrus.FieldLogger, agentMachine *capiproviderv1alpha1.AgentMachine, machine *clusterv1.Machine) (*ctrl.Result, error) {
160+
// TODO: this can be removed when we're sure no agent machines have this finalizer anymore
161+
if funk.ContainsString(agentMachine.GetFinalizers(), AgentMachineFinalizerName) {
162+
controllerutil.RemoveFinalizer(agentMachine, AgentMachineFinalizerName)
163+
if err := r.Update(ctx, agentMachine); err != nil {
164+
log.WithError(err).Errorf("failed to remove finalizer %s from resource %s %s", AgentMachineFinalizerName, agentMachine.Name, agentMachine.Namespace)
165+
return &ctrl.Result{}, err
153166
}
154-
} else { // AgentMachine is being deleted
155-
r.Log.Info("Found deletion timestamp on AgentMachine")
156-
if funk.ContainsString(agentMachine.GetFinalizers(), AgentMachineFinalizerName) {
157-
// deletion finalizer found, unbind the Agent from the ClusterDeployment
158-
if agentMachine.Status.AgentRef != nil {
159-
r.Log.Info("Removing ClusterDeployment ref to unbind Agent")
160-
agent, err := r.getAgent(ctx, log, agentMachine)
161-
if err != nil {
162-
if apierrors.IsNotFound(err) {
163-
log.WithError(err).Infof("Failed to get agent %s. assuming the agent no longer exists", agentMachine.Status.AgentRef.Name)
164-
} else {
165-
log.WithError(err).Errorf("Failed to get agent %s", agentMachine.Status.AgentRef.Name)
166-
return &ctrl.Result{}, err
167-
}
168-
} else {
169-
// Remove the AgentMachineRefLabel and set the clusterDeployment to nil
170-
delete(agent.ObjectMeta.Labels, AgentMachineRefLabelKey)
171-
agent.Spec.ClusterDeploymentName = nil
172-
if err := r.Update(ctx, agent); err != nil {
173-
log.WithError(err).Error("failed to remove the Agent's ClusterDeployment ref")
174-
return &ctrl.Result{}, err
175-
}
176-
}
177-
}
167+
}
168+
169+
// set delete hook if not present and machine not being deleted
170+
annotations := machine.GetAnnotations()
171+
if _, haveMachineHookAnnotation := annotations[machineDeleteHookName]; !haveMachineHookAnnotation && machine.DeletionTimestamp == nil {
172+
if annotations == nil {
173+
annotations = make(map[string]string)
174+
}
175+
annotations[machineDeleteHookName] = ""
176+
machine.SetAnnotations(annotations)
177+
if err := r.Update(ctx, machine); err != nil {
178+
log.WithError(err).Error("failed to add machine delete hook annotation")
179+
return &ctrl.Result{}, err
180+
}
181+
// return early here as there's no reason to check if the machine is held up on this hook as we just created it
182+
return nil, nil
183+
}
184+
185+
// return if the machine is not waiting on this hook
186+
cond := conditions.Get(machine, clusterv1.PreTerminateDeleteHookSucceededCondition)
187+
if cond == nil || cond.Status == corev1.ConditionTrue {
188+
return nil, nil
189+
}
190+
191+
if agentMachine.Status.AgentRef == nil {
192+
log.Info("Removing machine delete hook annotation - agent ref is nil")
193+
if err := r.removeMachineDeletionHookAnnotation(ctx, machine); err != nil {
194+
log.WithError(err).Error("failed to remove machine delete hook annotation")
195+
return &ctrl.Result{}, err
196+
}
197+
return nil, nil
198+
}
178199

179-
// remove our finalizer from the list and update it.
180-
controllerutil.RemoveFinalizer(agentMachine, AgentMachineFinalizerName)
181-
if err := r.Update(ctx, agentMachine); err != nil {
182-
log.WithError(err).Errorf("failed to remove finalizer %s from resource %s %s", AgentMachineFinalizerName, agentMachine.Name, agentMachine.Namespace)
183-
return &ctrl.Result{}, err
200+
agent, err := r.getAgent(ctx, log, agentMachine)
201+
if err != nil {
202+
if apierrors.IsNotFound(err) {
203+
log.WithError(err).Infof("Failed to get agent %s. assuming the agent no longer exists", agentMachine.Status.AgentRef.Name)
204+
if hookErr := r.removeMachineDeletionHookAnnotation(ctx, machine); hookErr != nil {
205+
log.WithError(hookErr).Error("failed to remove machine delete hook annotation")
206+
return &ctrl.Result{}, hookErr
184207
}
208+
return nil, nil
209+
} else {
210+
log.WithError(err).Errorf("Failed to get agent %s", agentMachine.Status.AgentRef.Name)
211+
return &ctrl.Result{}, err
212+
}
213+
}
214+
215+
if funk.Contains(agent.ObjectMeta.Labels, AgentMachineRefLabelKey) || agent.Spec.ClusterDeploymentName != nil {
216+
r.Log.Info("Removing ClusterDeployment ref to unbind Agent")
217+
delete(agent.ObjectMeta.Labels, AgentMachineRefLabelKey)
218+
agent.Spec.ClusterDeploymentName = nil
219+
if err := r.Update(ctx, agent); err != nil {
220+
log.WithError(err).Error("failed to remove the Agent's ClusterDeployment ref")
221+
return &ctrl.Result{}, err
185222
}
186-
r.Log.Info("AgentMachine is ready for deletion")
223+
}
187224

188-
return &ctrl.Result{}, nil
225+
// Remove the hook when either the host is back to some kind of unbound state, reclaim fails, or is not enabled
226+
removeHookStates := []string{
227+
aimodels.HostStatusDiscoveringUnbound,
228+
aimodels.HostStatusKnownUnbound,
229+
aimodels.HostStatusDisconnectedUnbound,
230+
aimodels.HostStatusInsufficientUnbound,
231+
aimodels.HostStatusDisabledUnbound,
232+
aimodels.HostStatusUnbindingPendingUserAction,
233+
aimodels.HostStatusError,
234+
}
235+
if funk.Contains(removeHookStates, agent.Status.DebugInfo.State) {
236+
log.Infof("Removing machine delete hook annotation for agent in status %s", agent.Status.DebugInfo.State)
237+
if err := r.removeMachineDeletionHookAnnotation(ctx, machine); err != nil {
238+
log.WithError(err).Error("failed to remove machine delete hook annotation")
239+
return &ctrl.Result{}, err
240+
}
241+
} else {
242+
log.Infof("Waiting for agent %s to reboot into discovery", agent.Name)
243+
return &ctrl.Result{RequeueAfter: 5 * time.Second}, nil
189244
}
190245

191246
return nil, nil
@@ -538,6 +593,41 @@ func getAddresses(foundAgent *aiv1beta1.Agent) []clusterv1.MachineAddress {
538593
return machineAddresses
539594
}
540595

596+
func (r *AgentMachineReconciler) mapMachineToAgentMachine(machine client.Object) []reconcile.Request {
597+
log := r.Log.WithFields(
598+
logrus.Fields{
599+
"machine": machine.GetName(),
600+
"machine_namespace": machine.GetNamespace(),
601+
},
602+
)
603+
604+
amList := &capiproviderv1alpha1.AgentMachineList{}
605+
opts := &client.ListOptions{
606+
Namespace: machine.GetNamespace(),
607+
}
608+
if err := r.List(context.Background(), amList, opts); err != nil {
609+
log.Debugf("failed to list agent machines")
610+
return []reconcile.Request{}
611+
}
612+
613+
for _, agentMachine := range amList.Items {
614+
for _, ref := range agentMachine.OwnerReferences {
615+
gv, err := schema.ParseGroupVersion(ref.APIVersion)
616+
if err != nil {
617+
continue
618+
}
619+
if ref.Kind == "Machine" && gv.Group == clusterv1.GroupVersion.Group && ref.Name == machine.GetName() {
620+
return []reconcile.Request{{NamespacedName: types.NamespacedName{
621+
Namespace: agentMachine.Namespace,
622+
Name: agentMachine.Name,
623+
}}}
624+
}
625+
}
626+
}
627+
628+
return []reconcile.Request{}
629+
}
630+
541631
// SetupWithManager sets up the controller with the Manager.
542632
func (r *AgentMachineReconciler) SetupWithManager(mgr ctrl.Manager, agentNamespace string) error {
543633
mapAgentToAgentMachine := func(agent client.Object) []reconcile.Request {
@@ -577,5 +667,6 @@ func (r *AgentMachineReconciler) SetupWithManager(mgr ctrl.Manager, agentNamespa
577667
return ctrl.NewControllerManagedBy(mgr).
578668
For(&capiproviderv1alpha1.AgentMachine{}).
579669
Watches(&source.Kind{Type: &aiv1beta1.Agent{}}, handler.EnqueueRequestsFromMapFunc(mapAgentToAgentMachine)).
670+
Watches(&source.Kind{Type: &clusterv1.Machine{}}, handler.EnqueueRequestsFromMapFunc(r.mapMachineToAgentMachine)).
580671
Complete(r)
581672
}

0 commit comments

Comments
 (0)