@@ -30,8 +30,9 @@ import (
30
30
)
31
31
32
32
// failureDomainAggregation holds the per-failure-domain machine counts used for sorting.
type failureDomainAggregation struct {
	// id is the failure domain name.
	id string
	// countPriority is the number of machines from the priority subset
	// (e.g. eligible-for-deletion or up-to-date machines) in this failure domain.
	countPriority int
	// countAll is the overall number of machines in this failure domain.
	countAll int
}

// failureDomainAggregations is a sortable list of failureDomainAggregation entries.
type failureDomainAggregations []failureDomainAggregation

// Less reports whether the element with
// index i should sort before the element with index j.
func (f failureDomainAggregations) Less(i, j int) bool {
	// A failure domain with fewer priority machines than the other goes first.
	switch {
	case f[i].countPriority < f[j].countPriority:
		return true
	case f[i].countPriority > f[j].countPriority:
		return false
	}

	// If both failure domains have the same number of priority machines,
	// use the overall number of machines to pick which one goes first.
	switch {
	case f[i].countAll < f[j].countAll:
		return true
	case f[i].countAll > f[j].countAll:
		return false
	}

	// If both failure domains have the same number of priority machines and overall machines,
	// keep the order in the list, which ensures a certain degree of randomness because the list
	// originates from a map. This helps to spread machines e.g. when concurrently working on many clusters.
	return i < j
}

// Swap swaps the elements with indexes i and j.
func (f failureDomainAggregations) Swap(i, j int) {
	f[j], f[i] = f[i], f[j]
}
53
74
54
- // PickMost returns a failure domain that is in machines and has most of the group of machines on.
55
- func PickMost (ctx context.Context , failureDomains clusterv1.FailureDomains , groupMachines , machines collections.Machines ) * string {
56
- // orderDescending sorts failure domains according to all machines belonging to the group.
57
- fds := orderDescending (ctx , failureDomains , groupMachines )
58
- for _ , fd := range fds {
59
- for _ , m := range machines {
60
- if m .Spec .FailureDomain == nil {
61
- continue
62
- }
63
- if * m .Spec .FailureDomain == fd .id {
64
- return & fd .id
65
- }
66
- }
67
- }
68
- return nil
69
- }
70
-
71
- // orderDescending returns the sorted failure domains in decreasing order.
72
- func orderDescending (ctx context.Context , failureDomains clusterv1.FailureDomains , machines collections.Machines ) failureDomainAggregations {
73
- aggregations := pick (ctx , failureDomains , machines )
75
+ // PickMost returns the failure domain from which we have to delete a control plane machine, which is the failure domain with most machines and at least one eligible machine in it.
76
+ func PickMost (ctx context.Context , failureDomains clusterv1.FailureDomains , allMachines , eligibleMachines collections.Machines ) * string {
77
+ aggregations := countByFailureDomain (ctx , failureDomains , allMachines , eligibleMachines )
74
78
if len (aggregations ) == 0 {
75
79
return nil
76
80
}
77
81
sort .Sort (sort .Reverse (aggregations ))
78
- return aggregations
82
+ if len (aggregations ) > 0 && aggregations [0 ].countPriority > 0 {
83
+ return ptr .To (aggregations [0 ].id )
84
+ }
85
+ return nil
79
86
}
80
87
81
- // PickFewest returns the failure domain with the fewest number of machines.
82
- func PickFewest (ctx context.Context , failureDomains clusterv1.FailureDomains , machines collections.Machines ) * string {
83
- aggregations := pick (ctx , failureDomains , machines )
88
+ // PickFewest returns the failure domain that will be used for placement of a new control plane machine, which is the failure domain with the fewest
89
+ // number of up-to-date, not deleted machines.
90
+ //
91
+ // Ensuring proper spreading of up-to-date, not deleted machines, is the highest priority to achieve ideal spreading of machines
92
+ // at stable state/when only up-to-date machines will exist.
93
+ //
94
+ // In case of tie (more failure domain with the same number of up-to-date, not deleted machines) the failure domain with the fewest number of
95
+ // machine overall is picked to ensure a better spreading of machines while the rollout is performed.
96
+ func PickFewest (ctx context.Context , failureDomains clusterv1.FailureDomains , allMachines , upToDateMachines collections.Machines ) * string {
97
+ aggregations := countByFailureDomain (ctx , failureDomains , allMachines , upToDateMachines )
84
98
if len (aggregations ) == 0 {
85
99
return nil
86
100
}
87
101
sort .Sort (aggregations )
88
102
return ptr .To (aggregations [0 ].id )
89
103
}
90
104
91
- func pick (ctx context.Context , failureDomains clusterv1.FailureDomains , machines collections.Machines ) failureDomainAggregations {
105
+ // countByFailureDomain returns failure domains with the number of machines in it.
106
+ // Note: countByFailureDomain computes both the number of machines as well as the number of a subset of machines with higher priority.
107
+ // E.g. for deletion out of date machines have higher priority vs other machines.
108
+ func countByFailureDomain (ctx context.Context , failureDomains clusterv1.FailureDomains , allMachines , priorityMachines collections.Machines ) failureDomainAggregations {
92
109
log := ctrl .LoggerFrom (ctx )
93
110
94
111
if len (failureDomains ) == 0 {
95
112
return failureDomainAggregations {}
96
113
}
97
114
98
- counters := map [string ]int {}
115
+ counters := map [string ]failureDomainAggregation {}
99
116
100
117
// Initialize the known failure domain keys to find out if an existing machine is in an unsupported failure domain.
101
- for fd := range failureDomains {
102
- counters [fd ] = 0
118
+ for id := range failureDomains {
119
+ counters [id ] = failureDomainAggregation {
120
+ id : id ,
121
+ countPriority : 0 ,
122
+ countAll : 0 ,
123
+ }
103
124
}
104
125
105
126
// Count how many machines are in each failure domain.
106
- for _ , m := range machines {
127
+ for _ , m := range allMachines {
107
128
if m .Spec .FailureDomain == nil {
108
129
continue
109
130
}
@@ -116,15 +137,30 @@ func pick(ctx context.Context, failureDomains clusterv1.FailureDomains, machines
116
137
log .Info (fmt .Sprintf ("Unknown failure domain %q for Machine %s (known failure domains: %v)" , id , m .GetName (), knownFailureDomains ))
117
138
continue
118
139
}
119
- counters [id ]++
140
+ a := counters [id ]
141
+ a .countAll ++
142
+ counters [id ] = a
120
143
}
121
144
122
- aggregations := make (failureDomainAggregations , 0 )
123
-
124
- // Gather up tuples of failure domains ids and counts
125
- for fd , count := range counters {
126
- aggregations = append (aggregations , failureDomainAggregation {id : fd , count : count })
145
+ for _ , m := range priorityMachines {
146
+ if m .Spec .FailureDomain == nil {
147
+ continue
148
+ }
149
+ id := * m .Spec .FailureDomain
150
+ if _ , ok := failureDomains [id ]; ! ok {
151
+ continue
152
+ }
153
+ a := counters [id ]
154
+ a .countPriority ++
155
+ counters [id ] = a
127
156
}
128
157
158
+ // Collect failure domain aggregations.
159
+ // Note: by creating the list from a map, we get a certain degree of randomness that helps to spread machines
160
+ // e.g. when concurrently working on many clusters.
161
+ aggregations := make (failureDomainAggregations , 0 )
162
+ for _ , count := range counters {
163
+ aggregations = append (aggregations , count )
164
+ }
129
165
return aggregations
130
166
}
0 commit comments