Skip to content

Commit d207046

Browse files
committed
Use get instead of list to retrieve servers
Under certain circumstances, when there are non-responsive cells, Nova may return a partial result instead of an error. (This is controlled by the list_records_by_skipping_down_cells parameter, which is true by default — no error will be thrown.) When it faces this issue, the CAPO controller will try to create a new server for the machine, resulting in the deletion of the existing server's ports. In order to avoid this issue, use GET instead of LIST with the server UUID stored in the machine spec when it is created, and store the server ID in the machine spec immediately. For that purpose, the server polling that used to take place in CreateMachine has to be moved to the main reconcile loop.
1 parent 141013b commit d207046

File tree

7 files changed

+139
-133
lines changed

7 files changed

+139
-133
lines changed

controllers/openstackcluster_controller.go

Lines changed: 69 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -215,10 +215,26 @@ func deleteBastion(scope scope.Scope, cluster *clusterv1.Cluster, openStackClust
215215
return err
216216
}
217217

218-
instanceName := fmt.Sprintf("%s-bastion", cluster.Name)
219-
instanceStatus, err := computeService.GetInstanceStatusByName(openStackCluster, instanceName)
220-
if err != nil {
221-
return err
218+
if openStackCluster.Status.Bastion != nil && openStackCluster.Status.Bastion.FloatingIP != "" {
219+
// Floating IP could have been created but not associated to instance, attempt to delete it from saved status first
220+
if err = networkingService.DeleteFloatingIP(openStackCluster, openStackCluster.Status.Bastion.FloatingIP); err != nil {
221+
handleUpdateOSCError(openStackCluster, fmt.Errorf("failed to delete floating IP: %w", err))
222+
return fmt.Errorf("failed to delete floating IP: %w", err)
223+
}
224+
}
225+
226+
var instanceStatus *compute.InstanceStatus
227+
if openStackCluster.Status.Bastion != nil && openStackCluster.Status.Bastion.ID != "" {
228+
instanceStatus, err = computeService.GetInstanceStatus(openStackCluster.Status.Bastion.ID)
229+
if err != nil {
230+
return err
231+
}
232+
} else {
233+
instanceName := fmt.Sprintf("%s-bastion", cluster.Name)
234+
instanceStatus, err = computeService.GetInstanceStatusByName(openStackCluster, instanceName)
235+
if err != nil {
236+
return err
237+
}
222238
}
223239

224240
if instanceStatus != nil {
@@ -277,8 +293,8 @@ func reconcileNormal(scope scope.Scope, cluster *clusterv1.Cluster, openStackClu
277293
return reconcile.Result{}, err
278294
}
279295

280-
if err = reconcileBastion(scope, cluster, openStackCluster); err != nil {
281-
return reconcile.Result{}, err
296+
if result, err := reconcileBastion(scope, cluster, openStackCluster); err != nil {
297+
return result, err
282298
}
283299

284300
availabilityZones, err := computeService.GetAvailabilityZones()
@@ -308,82 +324,91 @@ func reconcileNormal(scope scope.Scope, cluster *clusterv1.Cluster, openStackClu
308324
return reconcile.Result{}, nil
309325
}
310326

311-
func reconcileBastion(scope scope.Scope, cluster *clusterv1.Cluster, openStackCluster *infrav1.OpenStackCluster) error {
327+
func reconcileBastion(scope scope.Scope, cluster *clusterv1.Cluster, openStackCluster *infrav1.OpenStackCluster) (ctrl.Result, error) {
312328
scope.Logger().Info("Reconciling Bastion")
313329

314330
if openStackCluster.Spec.Bastion == nil || !openStackCluster.Spec.Bastion.Enabled {
315-
return deleteBastion(scope, cluster, openStackCluster)
331+
return reconcile.Result{}, deleteBastion(scope, cluster, openStackCluster)
316332
}
317333

318334
computeService, err := compute.NewService(scope)
319335
if err != nil {
320-
return err
336+
return reconcile.Result{}, err
321337
}
322338

323339
instanceSpec := bastionToInstanceSpec(openStackCluster, cluster.Name)
324340
bastionHash, err := compute.HashInstanceSpec(instanceSpec)
325341
if err != nil {
326-
return fmt.Errorf("failed computing bastion hash from instance spec: %w", err)
342+
return reconcile.Result{}, fmt.Errorf("failed computing bastion hash from instance spec: %w", err)
343+
} else if bastionHashHasChanged(bastionHash, openStackCluster.ObjectMeta.Annotations) {
344+
if err := deleteBastion(scope, cluster, openStackCluster); err != nil {
345+
return ctrl.Result{}, err
346+
}
327347
}
328348

329-
instanceStatus, err := computeService.GetInstanceStatusByName(openStackCluster, fmt.Sprintf("%s-bastion", cluster.Name))
330-
if err != nil {
331-
return err
349+
var instanceStatus *compute.InstanceStatus
350+
if openStackCluster.Status.Bastion != nil && openStackCluster.Status.Bastion.ID != "" {
351+
if instanceStatus, err = computeService.GetInstanceStatus(openStackCluster.Status.Bastion.ID); err != nil {
352+
return reconcile.Result{}, err
353+
}
332354
}
333-
if instanceStatus != nil {
334-
if !bastionHashHasChanged(bastionHash, openStackCluster.ObjectMeta.Annotations) {
335-
bastion, err := instanceStatus.BastionStatus(openStackCluster)
355+
if instanceStatus == nil {
356+
// First check if there is an existing instance with bastion name, in case where bastion ID would not have been properly stored in cluster status
357+
if instanceStatus, err = computeService.GetInstanceStatusByName(openStackCluster, instanceSpec.Name); err != nil {
358+
return reconcile.Result{}, err
359+
}
360+
if instanceStatus == nil {
361+
instanceStatus, err = computeService.CreateInstance(openStackCluster, openStackCluster, instanceSpec, cluster.Name)
336362
if err != nil {
337-
return err
338-
}
339-
// Add the current hash if no annotation is set.
340-
if _, ok := openStackCluster.ObjectMeta.Annotations[BastionInstanceHashAnnotation]; !ok {
341-
annotations.AddAnnotations(openStackCluster, map[string]string{BastionInstanceHashAnnotation: bastionHash})
363+
return reconcile.Result{}, fmt.Errorf("failed to create bastion: %w", err)
342364
}
343-
openStackCluster.Status.Bastion = bastion
344-
return nil
345-
}
346-
347-
if err := deleteBastion(scope, cluster, openStackCluster); err != nil {
348-
return err
349365
}
350366
}
367+
// Save hash & status as soon as we know we have an instance
368+
instanceStatus.UpdateBastionStatus(openStackCluster)
369+
annotations.AddAnnotations(openStackCluster, map[string]string{BastionInstanceHashAnnotation: bastionHash})
351370

352-
instanceStatus, err = computeService.CreateInstance(openStackCluster, openStackCluster, instanceSpec, cluster.Name)
353-
if err != nil {
354-
return fmt.Errorf("failed to reconcile bastion: %w", err)
371+
// Make sure that bastion instance has a valid state
372+
switch instanceStatus.State() {
373+
case infrav1.InstanceStateError:
374+
return ctrl.Result{}, fmt.Errorf("failed to reconcile bastion, instance state is ERROR")
375+
case infrav1.InstanceStateBuilding:
376+
scope.Logger().Info("Waiting for bastion instance to become ACTIVE", "id", instanceStatus.ID(), "status", instanceStatus.State())
377+
return ctrl.Result{RequeueAfter: waitForBuildingInstanceToReconcile}, nil
378+
case infrav1.InstanceStateDeleted:
379+
// This should normally be handled by deleteBastion
380+
openStackCluster.Status.Bastion = nil
381+
return ctrl.Result{}, nil
355382
}
356383

357384
networkingService, err := networking.NewService(scope)
358385
if err != nil {
359-
return err
386+
return ctrl.Result{}, err
360387
}
361388
clusterName := fmt.Sprintf("%s-%s", cluster.Namespace, cluster.Name)
362-
fp, err := networkingService.GetOrCreateFloatingIP(openStackCluster, openStackCluster, clusterName, openStackCluster.Spec.Bastion.FloatingIP)
389+
floatingIP := openStackCluster.Spec.Bastion.FloatingIP
390+
if openStackCluster.Status.Bastion.FloatingIP != "" {
391+
// Some floating IP has already been created for this bastion, make sure we re-use it
392+
floatingIP = openStackCluster.Status.Bastion.FloatingIP
393+
}
394+
fp, err := networkingService.GetOrCreateFloatingIP(openStackCluster, openStackCluster, clusterName, floatingIP)
363395
if err != nil {
364396
handleUpdateOSCError(openStackCluster, fmt.Errorf("failed to get or create floating IP for bastion: %w", err))
365-
return fmt.Errorf("failed to get or create floating IP for bastion: %w", err)
397+
return ctrl.Result{}, fmt.Errorf("failed to get or create floating IP for bastion: %w", err)
366398
}
399+
openStackCluster.Status.Bastion.FloatingIP = fp.FloatingIP
367400
port, err := computeService.GetManagementPort(openStackCluster, instanceStatus)
368401
if err != nil {
369402
err = fmt.Errorf("getting management port for bastion: %w", err)
370403
handleUpdateOSCError(openStackCluster, err)
371-
return err
404+
return ctrl.Result{}, err
372405
}
373406
err = networkingService.AssociateFloatingIP(openStackCluster, fp, port.ID)
374407
if err != nil {
375408
handleUpdateOSCError(openStackCluster, fmt.Errorf("failed to associate floating IP with bastion: %w", err))
376-
return fmt.Errorf("failed to associate floating IP with bastion: %w", err)
377-
}
378-
379-
bastion, err := instanceStatus.BastionStatus(openStackCluster)
380-
if err != nil {
381-
return err
409+
return ctrl.Result{}, fmt.Errorf("failed to associate floating IP with bastion: %w", err)
382410
}
383-
bastion.FloatingIP = fp.FloatingIP
384-
openStackCluster.Status.Bastion = bastion
385-
annotations.AddAnnotations(openStackCluster, map[string]string{BastionInstanceHashAnnotation: bastionHash})
386-
return nil
411+
return ctrl.Result{}, nil
387412
}
388413

389414
func bastionToInstanceSpec(openStackCluster *infrav1.OpenStackCluster, clusterName string) *compute.InstanceSpec {

controllers/openstackcluster_controller_test.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ import (
2222

2323
"github.com/go-logr/logr"
2424
"github.com/golang/mock/gomock"
25-
"github.com/gophercloud/gophercloud/openstack/compute/v2/servers"
25+
"github.com/gophercloud/gophercloud"
2626
"github.com/gophercloud/gophercloud/openstack/networking/v2/extensions/security/groups"
2727
"github.com/gophercloud/gophercloud/openstack/networking/v2/networks"
2828
"github.com/gophercloud/gophercloud/openstack/networking/v2/subnets"
@@ -38,7 +38,6 @@ import (
3838
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3939

4040
infrav1 "sigs.k8s.io/cluster-api-provider-openstack/api/v1alpha7"
41-
"sigs.k8s.io/cluster-api-provider-openstack/pkg/clients"
4241
"sigs.k8s.io/cluster-api-provider-openstack/pkg/scope"
4342
)
4443

@@ -195,18 +194,24 @@ var _ = Describe("OpenStackCluster controller", func() {
195194
Expect(err).To(BeNil())
196195
err = k8sClient.Create(ctx, capiCluster)
197196
Expect(err).To(BeNil())
197+
testCluster.Status = infrav1.OpenStackClusterStatus{
198+
Bastion: &infrav1.BastionStatus{
199+
ID: "bastion-uuid",
200+
},
201+
}
202+
err = k8sClient.Status().Update(ctx, testCluster)
203+
Expect(err).To(BeNil())
198204
scope, err := mockScopeFactory.NewClientScopeFromCluster(ctx, k8sClient, testCluster, nil, logr.Discard())
199205
Expect(err).To(BeNil())
200206

201207
computeClientRecorder := mockScopeFactory.ComputeClient.EXPECT()
202-
computeClientRecorder.ListServers(servers.ListOpts{
203-
Name: "^capi-cluster-bastion$",
204-
}).Return([]clients.ServerExt{}, nil)
208+
computeClientRecorder.GetServer("bastion-uuid").Return(nil, gophercloud.ErrResourceNotFound{})
205209

206210
networkClientRecorder := mockScopeFactory.NetworkClient.EXPECT()
207211
networkClientRecorder.ListSecGroup(gomock.Any()).Return([]groups.SecGroup{}, nil)
208212

209213
err = deleteBastion(scope, capiCluster, testCluster)
214+
Expect(testCluster.Status.Bastion).To(BeNil())
210215
Expect(err).To(BeNil())
211216
})
212217
It("should implicitly filter cluster subnets by cluster network", func() {

controllers/openstackmachine_controller.go

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ type OpenStackMachineReconciler struct {
6767
const (
6868
waitForClusterInfrastructureReadyDuration = 15 * time.Second
6969
waitForInstanceBecomeActiveToReconcile = 60 * time.Second
70+
waitForBuildingInstanceToReconcile = 10 * time.Second
7071
)
7172

7273
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=openstackmachines,verbs=get;list;watch;create;update;patch;delete
@@ -246,7 +247,10 @@ func (r *OpenStackMachineReconciler) reconcileDelete(scope scope.Scope, cluster
246247
}
247248
}
248249

249-
instanceStatus, err := computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name)
250+
var instanceStatus *compute.InstanceStatus
251+
if openStackMachine.Spec.InstanceID != nil {
252+
instanceStatus, err = computeService.GetInstanceStatus(*openStackMachine.Spec.InstanceID)
253+
}
250254
if err != nil {
251255
return ctrl.Result{}, err
252256
}
@@ -379,6 +383,9 @@ func (r *OpenStackMachineReconciler) reconcileNormal(ctx context.Context, scope
379383
scope.Logger().Info("Machine instance state is DELETED, no actions")
380384
conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.InstanceDeletedReason, clusterv1.ConditionSeverityError, "")
381385
return ctrl.Result{}, nil
386+
case infrav1.InstanceStateBuilding:
387+
scope.Logger().Info("Waiting for instance to become ACTIVE", "id", instanceStatus.ID(), "status", instanceStatus.State())
388+
return ctrl.Result{RequeueAfter: waitForBuildingInstanceToReconcile}, nil
382389
default:
383390
// The other state is normal (for example, migrating, shutoff) but we don't want to proceed until it's ACTIVE
384391
// due to potential conflict or unexpected actions
@@ -431,14 +438,24 @@ func (r *OpenStackMachineReconciler) reconcileNormal(ctx context.Context, scope
431438
}
432439

433440
func (r *OpenStackMachineReconciler) getOrCreate(logger logr.Logger, cluster *clusterv1.Cluster, openStackCluster *infrav1.OpenStackCluster, machine *clusterv1.Machine, openStackMachine *infrav1.OpenStackMachine, computeService *compute.Service, userData string) (*compute.InstanceStatus, error) {
434-
instanceStatus, err := computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name)
435-
if err != nil {
436-
logger.Info("Unable to get OpenStack instance", "name", openStackMachine.Name)
437-
conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.OpenStackErrorReason, clusterv1.ConditionSeverityError, err.Error())
438-
return nil, err
441+
var instanceStatus *compute.InstanceStatus
442+
var err error
443+
if openStackMachine.Spec.InstanceID != nil {
444+
instanceStatus, err = computeService.GetInstanceStatus(*openStackMachine.Spec.InstanceID)
445+
if err != nil {
446+
logger.Info("Unable to get OpenStack instance", "name", openStackMachine.Name)
447+
conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.OpenStackErrorReason, clusterv1.ConditionSeverityError, err.Error())
448+
return nil, err
449+
}
439450
}
440451

441452
if instanceStatus == nil {
453+
// First check if there is an existing instance with machine name, in case where instance ID would not have been stored in machine status
454+
if instanceStatus, err = computeService.GetInstanceStatusByName(openStackMachine, openStackMachine.Name); err == nil {
455+
if instanceStatus != nil {
456+
return instanceStatus, nil
457+
}
458+
}
442459
instanceSpec := machineToInstanceSpec(openStackCluster, machine, openStackMachine, userData)
443460
logger.Info("Machine does not exist, creating Machine", "name", openStackMachine.Name)
444461
instanceStatus, err = computeService.CreateInstance(openStackMachine, openStackCluster, instanceSpec, cluster.Name)

controllers/suite_test.go

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,14 @@ import (
3232
"k8s.io/apimachinery/pkg/types"
3333
"k8s.io/client-go/kubernetes/scheme"
3434
"k8s.io/client-go/rest"
35+
"k8s.io/utils/pointer"
3536
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
3637
"sigs.k8s.io/cluster-api/test/framework"
3738
"sigs.k8s.io/controller-runtime/pkg/client"
3839
"sigs.k8s.io/controller-runtime/pkg/envtest"
3940

4041
infrav1 "sigs.k8s.io/cluster-api-provider-openstack/api/v1alpha7"
42+
"sigs.k8s.io/cluster-api-provider-openstack/pkg/clients"
4143
"sigs.k8s.io/cluster-api-provider-openstack/pkg/cloud/services/compute"
4244
"sigs.k8s.io/cluster-api-provider-openstack/pkg/scope"
4345
"sigs.k8s.io/cluster-api-provider-openstack/test/helpers/external"
@@ -148,9 +150,13 @@ var _ = Describe("When calling getOrCreate", func() {
148150
cluster := &clusterv1.Cluster{}
149151
openStackCluster := &infrav1.OpenStackCluster{}
150152
machine := &clusterv1.Machine{}
151-
openStackMachine := &infrav1.OpenStackMachine{}
153+
openStackMachine := &infrav1.OpenStackMachine{
154+
Spec: infrav1.OpenStackMachineSpec{
155+
InstanceID: pointer.String("machine-uuid"),
156+
},
157+
}
152158

153-
mockScopeFactory.ComputeClient.EXPECT().ListServers(gomock.Any()).Return(nil, errors.New("Test error when listing servers"))
159+
mockScopeFactory.ComputeClient.EXPECT().GetServer(gomock.Any()).Return(nil, errors.New("Test error when getting server"))
154160
instanceStatus, err := reconsiler.getOrCreate(logger, cluster, openStackCluster, machine, openStackMachine, computeService, "")
155161
Expect(err).To(HaveOccurred())
156162
Expect(instanceStatus).To(BeNil())
@@ -163,4 +169,19 @@ var _ = Describe("When calling getOrCreate", func() {
163169
}
164170
}
165171
})
172+
173+
It("should retrieve instance by name if no ID is stored", func() {
174+
cluster := &clusterv1.Cluster{}
175+
openStackCluster := &infrav1.OpenStackCluster{}
176+
machine := &clusterv1.Machine{}
177+
openStackMachine := &infrav1.OpenStackMachine{}
178+
servers := make([]clients.ServerExt, 1)
179+
servers[0].ID = "machine-uuid"
180+
181+
mockScopeFactory.ComputeClient.EXPECT().ListServers(gomock.Any()).Return(servers, nil)
182+
instanceStatus, err := reconsiler.getOrCreate(logger, cluster, openStackCluster, machine, openStackMachine, computeService, "")
183+
Expect(err).ToNot(HaveOccurred())
184+
Expect(instanceStatus).ToNot(BeNil())
185+
Expect(instanceStatus.ID()).To(Equal("machine-uuid"))
186+
})
166187
})

pkg/cloud/services/compute/instance.go

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -333,27 +333,8 @@ func (s *Service) createInstanceImpl(eventObject runtime.Object, openStackCluste
333333
return nil, fmt.Errorf("error creating Openstack instance: %v", err)
334334
}
335335

336-
var createdInstance *InstanceStatus
337-
err = wait.PollUntilContextTimeout(context.TODO(), retryInterval, instanceCreateTimeout, true, func(_ context.Context) (bool, error) {
338-
createdInstance, err = s.GetInstanceStatus(server.ID)
339-
if err != nil {
340-
if capoerrors.IsRetryable(err) {
341-
return false, nil
342-
}
343-
return false, err
344-
}
345-
if createdInstance.State() == infrav1.InstanceStateError {
346-
return false, fmt.Errorf("error creating OpenStack instance %s, status changed to error", createdInstance.ID())
347-
}
348-
return createdInstance.State() == infrav1.InstanceStateActive, nil
349-
})
350-
if err != nil {
351-
record.Warnf(eventObject, "FailedCreateServer", "Failed to create server %s: %v", createdInstance.Name(), err)
352-
return nil, err
353-
}
354-
355-
record.Eventf(eventObject, "SuccessfulCreateServer", "Created server %s with id %s", createdInstance.Name(), createdInstance.ID())
356-
return createdInstance, nil
336+
record.Eventf(eventObject, "SuccessfulCreateServer", "Created server %s with id %s", server.Name, server.ID)
337+
return &InstanceStatus{server, s.scope.Logger()}, nil
357338
}
358339

359340
func volumeName(instanceName string, nameSuffix string) string {

0 commit comments

Comments
 (0)