Skip to content

Commit 7cda00b

Browse files
authored
Merge pull request #2958 from sedefsavas/kcp_upgrade_fix
🐛 KCP should check if it has machines being deleted before proceeding with scale up/down
2 parents ef72ed6 + 8bbdd95 commit 7cda00b

File tree

5 files changed

+60
-17
lines changed

5 files changed

+60
-17
lines changed

Diff for: controllers/machine_controller.go

-4
Original file line numberDiff line numberDiff line change
@@ -322,10 +322,6 @@ func (r *MachineReconciler) isDeleteNodeAllowed(ctx context.Context, cluster *cl
322322
// Do not delete the NodeRef if there are no remaining members of
323323
// the control plane.
324324
return errNoControlPlaneNodes
325-
case numControlPlaneMachines == 1 && util.IsControlPlaneMachine(machine):
326-
// Do not delete the NodeRef if this is the last member of the
327-
// control plane.
328-
return errLastControlPlaneNode
329325
default:
330326
// Otherwise it is okay to delete the NodeRef.
331327
return nil

Diff for: controllers/machine_controller_test.go

+5-3
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package controllers
1919
import (
2020
"context"
2121
"testing"
22+
"time"
2223

2324
. "github.com/onsi/gomega"
2425

@@ -836,7 +837,7 @@ func TestIsDeleteNodeAllowed(t *testing.T) {
836837
expectedError: errNoControlPlaneNodes,
837838
},
838839
{
839-
name: "is last control plane members",
840+
name: "is last control plane member",
840841
cluster: &clusterv1.Cluster{},
841842
machine: &clusterv1.Machine{
842843
ObjectMeta: metav1.ObjectMeta{
@@ -846,7 +847,8 @@ func TestIsDeleteNodeAllowed(t *testing.T) {
846847
clusterv1.ClusterLabelName: "test",
847848
clusterv1.MachineControlPlaneLabelName: "",
848849
},
849-
Finalizers: []string{clusterv1.MachineFinalizer, metav1.FinalizerDeleteDependents},
850+
Finalizers: []string{clusterv1.MachineFinalizer, metav1.FinalizerDeleteDependents},
851+
DeletionTimestamp: &metav1.Time{Time: time.Now().UTC()},
850852
},
851853
Spec: clusterv1.MachineSpec{
852854
ClusterName: "test-cluster",
@@ -859,7 +861,7 @@ func TestIsDeleteNodeAllowed(t *testing.T) {
859861
},
860862
},
861863
},
862-
expectedError: errLastControlPlaneNode,
864+
expectedError: errNoControlPlaneNodes,
863865
},
864866
{
865867
name: "has nodeRef and control plane is healthy",

Diff for: controlplane/kubeadm/controllers/controller.go

+7
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,7 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M
332332

333333
// reconcileHealth performs health checks for control plane components and etcd
334334
// It removes any etcd members that do not have a corresponding node.
335+
// Also, as a final step, checks if there are any machines being deleted.
335336
func (r *KubeadmControlPlaneReconciler) reconcileHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) error {
336337
logger := controlPlane.Logger()
337338

@@ -361,5 +362,11 @@ func (r *KubeadmControlPlaneReconciler) reconcileHealth(ctx context.Context, clu
361362
return &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
362363
}
363364

365+
// We need this check for scale up as well as down to avoid scaling up when there is a machine being deleted.
366+
// This should be at the end of this method, as there is no need to wait for a machine to be completely deleted before reconciling etcd.
367+
// TODO: Revisit during machine remediation implementation which may need to cover other machine phases.
368+
if controlPlane.HasDeletingMachine() {
369+
return &capierrors.RequeueAfterError{RequeueAfter: deleteRequeueAfter}
370+
}
364371
return nil
365372
}

Diff for: controlplane/kubeadm/controllers/scale.go

+5-9
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte
4848
func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
4949
logger := controlPlane.Logger()
5050

51+
// reconcileHealth returns an error if there is a machine being deleted, which is a required condition to check before scaling up
5152
if err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil {
5253
return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
5354
}
@@ -73,21 +74,16 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
7374
) (ctrl.Result, error) {
7475
logger := controlPlane.Logger()
7576

77+
if err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil {
78+
return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
79+
}
80+
7681
workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
7782
if err != nil {
7883
logger.Error(err, "Failed to create client to workload cluster")
7984
return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
8085
}
8186

82-
// Wait for any delete in progress to complete before deleting another Machine
83-
if controlPlane.HasDeletingMachine() {
84-
return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: deleteRequeueAfter}
85-
}
86-
87-
if err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil {
88-
return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: healthCheckFailedRequeueAfter}
89-
}
90-
9187
machineToDelete, err := selectMachineForScaleDown(controlPlane)
9288
if err != nil {
9389
return ctrl.Result{}, errors.Wrap(err, "failed to select machine for scale down")

Diff for: test/e2e/kcp_upgrade.go

+43-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,49 @@ func KCPUpgradeSpec(ctx context.Context, inputGetter func() KCPUpgradeSpecInput)
6666
namespace, cancelWatches = setupSpecNamespace(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder)
6767
})
6868

69-
It("Should successfully upgrade Kubernetes, DNS, kube-proxy, and etcd", func() {
69+
It("Should successfully upgrade Kubernetes, DNS, kube-proxy, and etcd in a single control plane cluster", func() {
70+
71+
By("Creating a workload cluster")
72+
Expect(input.E2EConfig.Variables).To(HaveKey(clusterctl.KubernetesVersion))
73+
Expect(input.E2EConfig.Variables).To(HaveKey(clusterctl.CNIPath))
74+
75+
cluster, controlPlane, _ = clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{
76+
ClusterProxy: input.BootstrapClusterProxy,
77+
ConfigCluster: clusterctl.ConfigClusterInput{
78+
LogFolder: filepath.Join(input.ArtifactFolder, "clusters", input.BootstrapClusterProxy.GetName()),
79+
ClusterctlConfigPath: input.ClusterctlConfigPath,
80+
KubeconfigPath: input.BootstrapClusterProxy.GetKubeconfigPath(),
81+
InfrastructureProvider: clusterctl.DefaultInfrastructureProvider,
82+
Flavor: clusterctl.DefaultFlavor,
83+
Namespace: namespace.Name,
84+
ClusterName: fmt.Sprintf("cluster-%s", util.RandomString(6)),
85+
KubernetesVersion: input.E2EConfig.GetKubernetesVersion(),
86+
ControlPlaneMachineCount: pointer.Int64Ptr(1),
87+
WorkerMachineCount: pointer.Int64Ptr(1),
88+
},
89+
CNIManifestPath: input.E2EConfig.GetCNIPath(),
90+
WaitForClusterIntervals: input.E2EConfig.GetIntervals(specName, "wait-cluster"),
91+
WaitForControlPlaneIntervals: input.E2EConfig.GetIntervals(specName, "wait-control-plane"),
92+
WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
93+
})
94+
95+
By("Upgrading Kubernetes, DNS, kube-proxy, and etcd versions")
96+
framework.UpgradeControlPlaneAndWaitForUpgrade(ctx, framework.UpgradeControlPlaneAndWaitForUpgradeInput{
97+
ClusterProxy: input.BootstrapClusterProxy,
98+
Cluster: cluster,
99+
ControlPlane: controlPlane,
100+
//Valid image tags for v1.17.2
101+
EtcdImageTag: "3.4.3-0",
102+
DNSImageTag: "1.6.6",
103+
KubernetesUpgradeVersion: "v1.17.2",
104+
WaitForMachinesToBeUpgraded: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
105+
WaitForDNSUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
106+
WaitForEtcdUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
107+
})
108+
109+
By("PASSED!")
110+
})
111+
It("Should successfully upgrade Kubernetes, DNS, kube-proxy, and etcd in a HA cluster", func() {
70112

71113
By("Creating a workload cluster")
72114
Expect(input.E2EConfig.Variables).To(HaveKey(clusterctl.KubernetesVersion))

0 commit comments

Comments
 (0)