
Commit 7925a4b

Merge pull request #2006 from shiftstack/issue1792
🐛 Don't try to resolve machine on delete if cluster not ready
2 parents: c834e9c + 9ebb706

7 files changed: +576 −227 lines changed
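In summary: when the cluster infrastructure never became ready, no bastion or machine resources can have been created, so the delete paths now exit early and remove the finalizer instead of trying to resolve the machine spec. The snippet below is a minimal, self-contained sketch of that guard; the types, field names, and finalizer value are simplified stand-ins for illustration, not the project's actual API.

package main

import "fmt"

// Simplified stand-ins for the controller's types (illustrative only; the real
// code uses infrav1.OpenStackCluster / OpenStackMachine and controllerutil).
type clusterStatus struct {
	Ready   bool
	Network *struct{} // nil until the cluster network has been reconciled
}

type openStackMachine struct {
	Finalizers []string
}

const machineFinalizer = "openstackmachine.infrastructure.cluster.x-k8s.io"

// reconcileDelete sketches the early exit added by this commit: if the cluster
// infrastructure never became ready, no machine resources can have been
// created, so we remove the finalizer and finish without resolving the spec.
func reconcileDelete(cs clusterStatus, m *openStackMachine) error {
	if !cs.Ready || cs.Network == nil {
		removeFinalizer(m, machineFinalizer)
		return nil
	}
	// ...resolve resources, adopt orphans, then delete instance and volumes...
	return nil
}

func removeFinalizer(m *openStackMachine, finalizer string) {
	kept := m.Finalizers[:0]
	for _, f := range m.Finalizers {
		if f != finalizer {
			kept = append(kept, f)
		}
	}
	m.Finalizers = kept
}

func main() {
	m := &openStackMachine{Finalizers: []string{machineFinalizer}}
	_ = reconcileDelete(clusterStatus{Ready: false}, m)
	fmt.Println("finalizers after deleting a never-ready cluster's machine:", m.Finalizers)
}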

controllers/openstackcluster_controller.go

Lines changed: 13 additions & 6 deletions
@@ -290,7 +290,18 @@ func deleteBastion(scope *scope.WithLogger, cluster *clusterv1.Cluster, openStac
         }
     }
 
-    if instanceStatus != nil {
+    // If no instance was created we currently need to check for orphaned
+    // volumes. This requires resolving the instance spec.
+    // TODO: write volumes to status resources on creation so this is no longer required.
+    if instanceStatus == nil {
+        instanceSpec, err := bastionToInstanceSpec(openStackCluster, cluster)
+        if err != nil {
+            return err
+        }
+        if err := computeService.DeleteVolumes(instanceSpec); err != nil {
+            return fmt.Errorf("delete volumes: %w", err)
+        }
+    } else {
         instanceNS, err := instanceStatus.NetworkStatus()
         if err != nil {
             return err
@@ -307,11 +318,7 @@ func deleteBastion(scope *scope.WithLogger, cluster *clusterv1.Cluster, openStac
         }
     }
 
-    instanceSpec, err := bastionToInstanceSpec(openStackCluster, cluster)
-    if err != nil {
-        return err
-    }
-    if err = computeService.DeleteInstance(openStackCluster, instanceStatus, instanceSpec); err != nil {
+    if err = computeService.DeleteInstance(openStackCluster, instanceStatus); err != nil {
         handleUpdateOSCError(openStackCluster, fmt.Errorf("failed to delete bastion: %w", err))
         return fmt.Errorf("failed to delete bastion: %w", err)
     }
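Net effect on deleteBastion: the resolved instance spec is now only needed when no instance exists, where it is used to find orphaned volumes, and DeleteInstance is called with the instance status alone. A simplified, self-contained sketch of that flow follows; the types and helpers (computeService, resolveSpec) are stand-ins for illustration, not the repository's actual compute service API.

package main

import "fmt"

// Stand-in types for illustration; the real code uses compute.InstanceStatus,
// compute.Service, and bastionToInstanceSpec from this repository.
type instanceStatus struct{ ID string }
type instanceSpec struct{ Volumes []string }

type computeService struct{}

func (computeService) deleteVolumes(spec *instanceSpec) error { return nil }

// deleteInstance treats a nil status as "nothing to delete".
func (computeService) deleteInstance(status *instanceStatus) error { return nil }

// deleteBastion sketches the branching introduced by this commit: resolve the
// spec only on the "no instance" path, where it is needed to clean up orphaned
// volumes; the instance itself is deleted from its status alone.
func deleteBastion(svc computeService, status *instanceStatus, resolveSpec func() (*instanceSpec, error)) error {
	if status == nil {
		spec, err := resolveSpec()
		if err != nil {
			return err
		}
		if err := svc.deleteVolumes(spec); err != nil {
			return fmt.Errorf("delete volumes: %w", err)
		}
	}
	if err := svc.deleteInstance(status); err != nil {
		return fmt.Errorf("failed to delete bastion: %w", err)
	}
	return nil
}

func main() {
	svc := computeService{}
	resolve := func() (*instanceSpec, error) { return &instanceSpec{}, nil }
	fmt.Println(deleteBastion(svc, nil, resolve))                      // orphaned-volume path
	fmt.Println(deleteBastion(svc, &instanceStatus{ID: "x"}, resolve)) // normal delete path
}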

controllers/openstackcluster_controller_test.go

Lines changed: 4 additions & 1 deletion
@@ -208,6 +208,9 @@ var _ = Describe("OpenStackCluster controller", func() {
             testCluster.Status = infrav1.OpenStackClusterStatus{
                 Bastion: &infrav1.BastionStatus{
                     ID: "bastion-uuid",
+                    Resolved: &infrav1.ResolvedMachineSpec{
+                        ImageID: "imageID",
+                    },
                 },
             }
             err = k8sClient.Status().Update(ctx, testCluster)
@@ -221,8 +224,8 @@ var _ = Describe("OpenStackCluster controller", func() {
             computeClientRecorder.GetServer("bastion-uuid").Return(nil, gophercloud.ErrResourceNotFound{})
 
             err = deleteBastion(scope, capiCluster, testCluster)
-            Expect(testCluster.Status.Bastion).To(BeNil())
             Expect(err).To(BeNil())
+            Expect(testCluster.Status.Bastion).To(BeNil())
         })
         It("should adopt an existing bastion even if its uuid is not stored in status", func() {
             testCluster.SetName("adopt-existing-bastion")

controllers/openstackmachine_controller.go

Lines changed: 141 additions & 68 deletions
@@ -169,26 +169,21 @@ func resolveMachineResources(scope *scope.WithLogger, clusterResourceName string
         openStackMachine.Status.Resolved = resolved
     }
     // Resolve and store resources
-    changed, err := compute.ResolveMachineSpec(scope,
+    return compute.ResolveMachineSpec(scope,
         &openStackMachine.Spec, resolved,
         clusterResourceName, openStackMachine.Name,
         openStackCluster, getManagedSecurityGroup(openStackCluster, machine))
-    if err != nil {
-        return false, err
-    }
-    if changed {
-        // If the resolved machine spec changed we need to start the reconcile again to prevent inconsistency between reconciles.
-        return true, nil
-    }
+}
 
+func adoptMachineResources(scope *scope.WithLogger, openStackMachine *infrav1.OpenStackMachine) error {
     resources := openStackMachine.Status.Resources
     if resources == nil {
         resources = &infrav1.MachineResources{}
         openStackMachine.Status.Resources = resources
     }
 
     // Adopt any existing resources
-    return false, compute.AdoptMachineResources(scope, resolved, resources)
+    return compute.AdoptMachineResources(scope, openStackMachine.Status.Resolved, resources)
 }
 
 func patchMachine(ctx context.Context, patchHelper *patch.Helper, openStackMachine *infrav1.OpenStackMachine, machine *clusterv1.Machine, options ...patch.Option) error {
@@ -256,66 +251,72 @@ func (r *OpenStackMachineReconciler) reconcileDelete(scope *scope.WithLogger, cl
         return ctrl.Result{}, err
     }
 
-    // We may have resources to adopt if the cluster is ready
-    if openStackCluster.Status.Ready && openStackCluster.Status.Network != nil {
-        if _, err := resolveMachineResources(scope, clusterResourceName, openStackCluster, openStackMachine, machine); err != nil {
-            return ctrl.Result{}, err
-        }
+    // Nothing to do if the cluster is not ready because no machine resources were created.
+    if !openStackCluster.Status.Ready || openStackCluster.Status.Network == nil {
+        // The finalizer should not have been added yet in this case,
+        // but the following handles the upgrade case.
+        controllerutil.RemoveFinalizer(openStackMachine, infrav1.MachineFinalizer)
+        return ctrl.Result{}, nil
     }
 
-    if openStackCluster.Spec.APIServerLoadBalancer.IsEnabled() {
-        loadBalancerService, err := loadbalancer.NewService(scope)
-        if err != nil {
-            return ctrl.Result{}, err
-        }
+    // For machines created after v0.10, or any machine which has been
+    // reconciled at least once by v0.10 or later, status.Resolved always
+    // exists before any resources are created. We can therefore assume
+    // that if it does not exist, no resources were created.
+    //
+    // There is an upgrade edge case where a machine may have been marked
+    // deleted before upgrade but we are completing it after upgrade. For
+    // this use case only we make a best effort to resolve resources before
+    // continuing, but if we get an error we log it and continue anyway.
+    // This has the potential to leak resources, but only in this specific
+    // edge case. The alternative is to continue retrying until it succeeds,
+    // but that risks never deleting a machine which cannot be resolved due
+    // to a spec error.
+    //
+    // This code can and should be deleted in a future release when we are
+    // sure that all machines have been reconciled at least by a v0.10 or
+    // later controller.
+    if _, err := resolveMachineResources(scope, clusterResourceName, openStackCluster, openStackMachine, machine); err != nil {
+        // Return the error, but allow the resource to be removed anyway.
+        controllerutil.RemoveFinalizer(openStackMachine, infrav1.MachineFinalizer)
+        return ctrl.Result{}, err
+    }
 
-        err = loadBalancerService.DeleteLoadBalancerMember(openStackCluster, machine, openStackMachine, clusterResourceName)
-        if err != nil {
-            conditions.MarkFalse(openStackMachine, infrav1.APIServerIngressReadyCondition, infrav1.LoadBalancerMemberErrorReason, clusterv1.ConditionSeverityWarning, "Machine could not be removed from load balancer: %v", err)
-            return ctrl.Result{}, err
-        }
+    // Check for any orphaned resources
+    // N.B. Unlike resolveMachineResources, we must always look for orphaned resources in the delete path.
+    if err := adoptMachineResources(scope, openStackMachine); err != nil {
+        return ctrl.Result{}, fmt.Errorf("adopting machine resources: %w", err)
     }
 
-    var instanceStatus *compute.InstanceStatus
-    if openStackMachine.Status.InstanceID != nil {
-        instanceStatus, err = computeService.GetInstanceStatus(*openStackMachine.Status.InstanceID)
-        if err != nil {
-            return ctrl.Result{}, err
-        }
-    } else if instanceStatus, err = computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name); err != nil {
+    instanceStatus, err := getInstanceStatus(openStackMachine, computeService)
+    if err != nil {
         return ctrl.Result{}, err
     }
-    if !openStackCluster.Spec.APIServerLoadBalancer.IsEnabled() && util.IsControlPlaneMachine(machine) && openStackCluster.Spec.APIServerFloatingIP == nil {
-        if instanceStatus != nil {
-            instanceNS, err := instanceStatus.NetworkStatus()
-            if err != nil {
-                openStackMachine.SetFailure(
-                    capierrors.UpdateMachineError,
-                    fmt.Errorf("get network status for OpenStack instance %s with ID %s: %v", instanceStatus.Name(), instanceStatus.ID(), err),
-                )
-                return ctrl.Result{}, nil
-            }
 
-            addresses := instanceNS.Addresses()
-            for _, address := range addresses {
-                if address.Type == corev1.NodeExternalIP {
-                    if err = networkingService.DeleteFloatingIP(openStackMachine, address.Address); err != nil {
-                        conditions.MarkFalse(openStackMachine, infrav1.APIServerIngressReadyCondition, infrav1.FloatingIPErrorReason, clusterv1.ConditionSeverityError, "Deleting floating IP failed: %v", err)
-                        return ctrl.Result{}, fmt.Errorf("delete floating IP %q: %w", address.Address, err)
-                    }
-                }
-            }
+    if util.IsControlPlaneMachine(machine) {
+        if err := removeAPIServerEndpoint(scope, openStackCluster, openStackMachine, instanceStatus, clusterResourceName); err != nil {
+            return ctrl.Result{}, err
         }
     }
 
-    instanceSpec, err := machineToInstanceSpec(openStackCluster, machine, openStackMachine, "")
-    if err != nil {
-        return ctrl.Result{}, err
+    // If no instance was created we currently need to check for orphaned
+    // volumes. This requires resolving the instance spec.
+    // TODO: write volumes to status resources on creation so this is no longer required.
+    if instanceStatus == nil && openStackMachine.Status.Resolved != nil {
+        instanceSpec, err := machineToInstanceSpec(openStackCluster, machine, openStackMachine, "")
+        if err != nil {
+            return ctrl.Result{}, err
+        }
+        if err := computeService.DeleteVolumes(instanceSpec); err != nil {
+            return ctrl.Result{}, fmt.Errorf("delete volumes: %w", err)
+        }
     }
 
-    if err := computeService.DeleteInstance(openStackMachine, instanceStatus, instanceSpec); err != nil {
-        conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.InstanceDeleteFailedReason, clusterv1.ConditionSeverityError, "Deleting instance failed: %v", err)
-        return ctrl.Result{}, fmt.Errorf("delete instance: %w", err)
+    if instanceStatus != nil {
+        if err := computeService.DeleteInstance(openStackMachine, instanceStatus); err != nil {
+            conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.InstanceDeleteFailedReason, clusterv1.ConditionSeverityError, "Deleting instance failed: %v", err)
+            return ctrl.Result{}, fmt.Errorf("delete instance: %w", err)
+        }
     }
 
     trunkSupported, err := networkingService.IsTrunkExtSupported()
@@ -341,6 +342,62 @@ func (r *OpenStackMachineReconciler) reconcileDelete(scope *scope.WithLogger, cl
     return ctrl.Result{}, nil
 }
 
+func getInstanceStatus(openStackMachine *infrav1.OpenStackMachine, computeService *compute.Service) (*compute.InstanceStatus, error) {
+    if openStackMachine.Status.InstanceID != nil {
+        return computeService.GetInstanceStatus(*openStackMachine.Status.InstanceID)
+    }
+    return computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name)
+}
+
+func removeAPIServerEndpoint(scope *scope.WithLogger, openStackCluster *infrav1.OpenStackCluster, openStackMachine *infrav1.OpenStackMachine, instanceStatus *compute.InstanceStatus, clusterResourceName string) error {
+    if openStackCluster.Spec.APIServerLoadBalancer.IsEnabled() {
+        loadBalancerService, err := loadbalancer.NewService(scope)
+        if err != nil {
+            return err
+        }
+
+        err = loadBalancerService.DeleteLoadBalancerMember(openStackCluster, openStackMachine, clusterResourceName)
+        if err != nil {
+            conditions.MarkFalse(openStackMachine, infrav1.APIServerIngressReadyCondition, infrav1.LoadBalancerMemberErrorReason, clusterv1.ConditionSeverityWarning, "Machine could not be removed from load balancer: %v", err)
+            return err
+        }
+        return nil
+    }
+
+    // XXX(mdbooth): This looks wrong to me. Surely we should only ever
+    // disassociate the floating IP here. I would expect the API server
+    // floating IP to be created and deleted with the cluster. And if the
+    // delete behaviour is correct, we leak it if the instance was
+    // previously deleted.
+    if openStackCluster.Spec.APIServerFloatingIP == nil && instanceStatus != nil {
+        instanceNS, err := instanceStatus.NetworkStatus()
+        if err != nil {
+            openStackMachine.SetFailure(
+                capierrors.UpdateMachineError,
+                fmt.Errorf("get network status for OpenStack instance %s with ID %s: %v", instanceStatus.Name(), instanceStatus.ID(), err),
+            )
+            return nil
+        }
+
+        networkingService, err := networking.NewService(scope)
+        if err != nil {
+            return err
+        }
+
+        addresses := instanceNS.Addresses()
+        for _, address := range addresses {
+            if address.Type == corev1.NodeExternalIP {
+                if err = networkingService.DeleteFloatingIP(openStackMachine, address.Address); err != nil {
+                    conditions.MarkFalse(openStackMachine, infrav1.APIServerIngressReadyCondition, infrav1.FloatingIPErrorReason, clusterv1.ConditionSeverityError, "Deleting floating IP failed: %v", err)
+                    return fmt.Errorf("delete floating IP %q: %w", address.Address, err)
+                }
+            }
+        }
+    }
+
+    return nil
+}
+
 // GetPortIDs returns a list of port IDs from a list of PortStatus.
 func GetPortIDs(ports []infrav1.PortStatus) []string {
     portIDs := make([]string, len(ports))
@@ -489,34 +546,50 @@ func (r *OpenStackMachineReconciler) reconcileNormal(ctx context.Context, scope
         return ctrl.Result{}, nil
     }
 
-    // If the OpenStackMachine doesn't have our finalizer, add it.
-    if controllerutil.AddFinalizer(openStackMachine, infrav1.MachineFinalizer) {
-        // Register the finalizer immediately to avoid orphaning OpenStack resources on delete
-        return ctrl.Result{}, nil
-    }
-
     if !openStackCluster.Status.Ready {
         scope.Logger().Info("Cluster infrastructure is not ready yet, re-queuing machine")
         conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
         return ctrl.Result{RequeueAfter: waitForClusterInfrastructureReadyDuration}, nil
     }
 
-    if changed, err := resolveMachineResources(scope, clusterResourceName, openStackCluster, openStackMachine, machine); changed || err != nil {
-        scope.Logger().V(6).Info("Machine resources updated, requeuing")
-        return ctrl.Result{}, err
-    }
-
     // Make sure bootstrap data is available and populated.
     if machine.Spec.Bootstrap.DataSecretName == nil {
         scope.Logger().Info("Bootstrap data secret reference is not yet available")
         conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.WaitingForBootstrapDataReason, clusterv1.ConditionSeverityInfo, "")
         return ctrl.Result{}, nil
     }
-    userData, err := r.getBootstrapData(ctx, machine, openStackMachine)
+
+    changed, err := resolveMachineResources(scope, clusterResourceName, openStackCluster, openStackMachine, machine)
     if err != nil {
         return ctrl.Result{}, err
     }
+
+    // Also add the finalizer when writing resolved resources so we can start creating resources on the next reconcile.
+    if controllerutil.AddFinalizer(openStackMachine, infrav1.MachineFinalizer) {
+        changed = true
+    }
+
+    // We requeue if we either added the finalizer or resolved machine
+    // resources. This means that we never create any resources unless we
+    // have observed that the finalizer and resolved machine resources were
+    // successfully written in a previous transaction. This in turn means
+    // that in the delete path we can be sure that if there are no resolved
+    // resources then no resources were created.
+    if changed {
+        scope.Logger().V(6).Info("Machine resources updated, requeuing")
+        return ctrl.Result{}, nil
+    }
+
+    // Check for orphaned resources previously created but not written to the status
+    if err := adoptMachineResources(scope, openStackMachine); err != nil {
+        return ctrl.Result{}, fmt.Errorf("adopting machine resources: %w", err)
+    }
+
     scope.Logger().Info("Reconciling Machine")
+    userData, err := r.getBootstrapData(ctx, machine, openStackMachine)
+    if err != nil {
+        return ctrl.Result{}, err
+    }
 
     computeService, err := compute.NewService(scope)
     if err != nil {
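The reconcileNormal changes above establish the invariant the new delete path relies on: the resolved spec and the finalizer are persisted, and the reconcile requeued, before any resource is created, so a missing status.Resolved on delete means nothing was created. Below is a minimal, self-contained sketch of that ordering; the types, field names, and finalizer value are illustrative stand-ins, not the project's actual API.

package main

import "fmt"

// Stand-ins for the machine object and its persisted status (illustrative).
type machineStatus struct {
	Resolved *string // resolved spec, persisted before any resource is created
}

type machine struct {
	Finalizers []string
	Status     machineStatus
}

const finalizer = "openstackmachine.infrastructure.cluster.x-k8s.io"

// reconcileNormal sketches the ordering introduced by this commit: resolve the
// spec and add the finalizer first, and requeue if either changed. Resources
// are only created on a later pass, so their existence always implies that
// status.Resolved (and the finalizer) were successfully persisted.
func reconcileNormal(m *machine) (requeue bool) {
	changed := false

	if m.Status.Resolved == nil {
		resolved := "resolved-spec"
		m.Status.Resolved = &resolved
		changed = true
	}
	if !hasFinalizer(m, finalizer) {
		m.Finalizers = append(m.Finalizers, finalizer)
		changed = true
	}
	if changed {
		// Persist (patch) the object and requeue; do not create anything yet.
		return true
	}

	// Safe to create resources: a previous reconcile persisted Resolved + finalizer.
	fmt.Println("creating server and volumes...")
	return false
}

func hasFinalizer(m *machine, f string) bool {
	for _, existing := range m.Finalizers {
		if existing == f {
			return true
		}
	}
	return false
}

func main() {
	m := &machine{}
	fmt.Println("first pass requeues:", reconcileNormal(m))
	fmt.Println("second pass creates:", reconcileNormal(m))
}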
