
Commit 584be14

Merge pull request #1532 from pwschuurman/extra-requisite-zones
Add --fallback-requisite-zones command line argument
2 parents 9b7675d + e1633ab commit 584be14

File tree: 6 files changed (+261, -116)

cmd/gce-pd-csi-driver/main.go (+7, -1)
@@ -21,6 +21,7 @@ import (
 	"math/rand"
 	"os"
 	"runtime"
+	"strings"
 	"time"

 	"k8s.io/klog/v2"
@@ -67,6 +68,8 @@ var (
 	maxConcurrentFormatAndMount = flag.Int("max-concurrent-format-and-mount", 1, "If set then format and mount operations are serialized on each node. This is stronger than max-concurrent-format as it includes fsck and other mount operations")
 	formatAndMountTimeout = flag.Duration("format-and-mount-timeout", 1*time.Minute, "The maximum duration of a format and mount operation before another such operation will be started. Used only if --serialize-format-and-mount")

+	fallbackRequisiteZonesFlag = flag.String("fallback-requisite-zones", "", "Comma separated list of requisite zones that will be used if there are not sufficient zones present in requisite topologies when provisioning a disk")
+
 	version string
 )
@@ -145,6 +148,9 @@ func handle() {
 	// Initialize identity server
 	identityServer := driver.NewIdentityServer(gceDriver)

+	// Initialize requisite zones
+	fallbackRequisiteZones := strings.Split(*fallbackRequisiteZonesFlag, ",")
+
 	// Initialize requirements for the controller service
 	var controllerServer *driver.GCEControllerServer
 	if *runControllerService {
@@ -154,7 +160,7 @@ func handle() {
 		}
 		initialBackoffDuration := time.Duration(*errorBackoffInitialDurationMs) * time.Millisecond
 		maxBackoffDuration := time.Duration(*errorBackoffMaxDurationMs) * time.Millisecond
-		controllerServer = driver.NewControllerServer(gceDriver, cloudProvider, initialBackoffDuration, maxBackoffDuration)
+		controllerServer = driver.NewControllerServer(gceDriver, cloudProvider, initialBackoffDuration, maxBackoffDuration, fallbackRequisiteZones)
 	} else if *cloudConfigFilePath != "" {
 		klog.Warningf("controller service is disabled but cloud config given - it has no effect")
 	}
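A quick note on how the flag's value is consumed (a standalone sketch, not code from this commit; the zone names are hypothetical): the driver splits the raw string on commas, and Go's strings.Split returns a one-element slice containing the empty string when given "", so leaving the flag at its default yields []string{""} rather than an empty list.

	// Minimal, self-contained sketch of the flag parsing shown above.
	package main

	import (
		"fmt"
		"strings"
	)

	func main() {
		// e.g. --fallback-requisite-zones=us-central1-a,us-central1-b,us-central1-c
		zones := strings.Split("us-central1-a,us-central1-b,us-central1-c", ",")
		fmt.Println(zones) // [us-central1-a us-central1-b us-central1-c]

		// With the default empty value, Split still yields one (empty) element:
		fmt.Println(len(strings.Split("", ","))) // 1
	}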

pkg/gce-pd-csi-driver/controller.go (+44, -23)
@@ -91,6 +91,15 @@ type GCEControllerServer struct {
 	// publish/unpublish call will clear the backoff condition for a node and
 	// disk.
 	errorBackoff *csiErrorBackoff
+
+	// Requisite zones to fall back to when provisioning a disk.
+	// If there is an insufficient number of zones available in the union
+	// of preferred/requisite topology, this list is used instead of
+	// the passed-in requisite topology.
+	// The main use case of this field is to support Regional Persistent Disk
+	// provisioning in GKE Autopilot, where a GKE cluster can
+	// be scaled down to 1 zone.
+	fallbackRequisiteZones []string
 }

 type csiErrorBackoffId string
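To make the field's semantics concrete, the sketch below (a hypothetical standalone helper, not code from this commit) restates the substitution rule from the comment above using the same apimachinery string sets: the fallback list replaces the requisite zones only when requisite-minus-preferred cannot cover the remaining replica count.

	package main

	import (
		"fmt"

		"k8s.io/apimachinery/pkg/util/sets"
	)

	// remainingAfterFallback returns the candidate zones to draw the rest of
	// the replicas from: normally requisite minus preferred, but if that set
	// is too small, the fallback list (minus preferred) is used instead.
	func remainingAfterFallback(req, pref, fallback []string, needed int) (sets.String, error) {
		prefSet := sets.NewString(pref...)
		remaining := sets.NewString(req...).Difference(prefSet)
		if remaining.Len() < needed {
			fb := sets.NewString(fallback...).Difference(prefSet)
			if fb.Len() < needed {
				return nil, fmt.Errorf("need %d more zones, but requisite and fallback lists are both too small", needed)
			}
			remaining = fb
		}
		return remaining, nil
	}

	func main() {
		// Mirrors the Autopilot case: topology collapsed to one zone, but a
		// regional PD needs a second replica zone from the fallback list.
		zones, _ := remainingAfterFallback(
			[]string{"us-central1-a"},                  // requisite from the CSI request
			[]string{"us-central1-a"},                  // preferred
			[]string{"us-central1-b", "us-central1-c"}, // --fallback-requisite-zones
			1,
		)
		fmt.Println(zones.List()) // [us-central1-b us-central1-c]
	}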
@@ -279,7 +288,7 @@ func (gceCS *GCEControllerServer) CreateVolume(ctx context.Context, req *csi.Cre
 	var volKey *meta.Key
 	switch params.ReplicationType {
 	case replicationTypeNone:
-		zones, err = pickZones(ctx, gceCS, req.GetAccessibilityRequirements(), 1, locationTopReq)
+		zones, err = gceCS.pickZones(ctx, req.GetAccessibilityRequirements(), 1, locationTopReq)
 		if err != nil {
 			return nil, status.Errorf(codes.InvalidArgument, "CreateVolume failed to pick zones for disk: %v", err.Error())
 		}
@@ -289,7 +298,7 @@ func (gceCS *GCEControllerServer) CreateVolume(ctx context.Context, req *csi.Cre
 		volKey = meta.ZonalKey(name, zones[0])

 	case replicationTypeRegionalPD:
-		zones, err = pickZones(ctx, gceCS, req.GetAccessibilityRequirements(), 2, locationTopReq)
+		zones, err = gceCS.pickZones(ctx, req.GetAccessibilityRequirements(), 2, locationTopReq)
 		if err != nil {
 			return nil, status.Errorf(codes.InvalidArgument, "CreateVolume failed to pick zones for disk: %v", err.Error())
 		}
@@ -1566,7 +1575,7 @@ func prependZone(zone string, zones []string) []string {
 	return newZones
 }

-func pickZonesFromTopology(top *csi.TopologyRequirement, numZones int, locationTopReq *locationRequirements) ([]string, error) {
+func pickZonesFromTopology(top *csi.TopologyRequirement, numZones int, locationTopReq *locationRequirements, fallbackRequisiteZones []string) ([]string, error) {
 	reqZones, err := getZonesFromTopology(top.GetRequisite())
 	if err != nil {
 		return nil, fmt.Errorf("could not get zones from requisite topology: %w", err)
@@ -1611,27 +1620,39 @@

 	if numZones <= len(prefZones) {
 		return prefZones[0:numZones], nil
-	} else {
-		zones := sets.String{}
-		// Add all preferred zones into zones
-		zones.Insert(prefZones...)
-		remainingNumZones := numZones - len(prefZones)
-		// Take all of the remaining zones from requisite zones
-		reqSet := sets.NewString(reqZones...)
-		prefSet := sets.NewString(prefZones...)
-		remainingZones := reqSet.Difference(prefSet)
-
-		if remainingZones.Len() < remainingNumZones {
+	}
+
+	remainingNumZones := numZones - len(prefZones)
+	// Take all of the remaining zones from requisite zones
+	reqSet := sets.NewString(reqZones...)
+	prefSet := sets.NewString(prefZones...)
+	remainingZones := reqSet.Difference(prefSet)
+
+	if remainingZones.Len() < remainingNumZones {
+		fallbackSet := sets.NewString(fallbackRequisiteZones...)
+		remainingFallbackZones := fallbackSet.Difference(prefSet)
+		if remainingFallbackZones.Len() >= remainingNumZones {
+			remainingZones = remainingFallbackZones
+		} else {
 			return nil, fmt.Errorf("need %v zones from topology, only got %v unique zones", numZones, reqSet.Union(prefSet).Len())
 		}
-		// Add the remaining number of zones into the set
-		nSlice, err := pickRandAndConsecutive(remainingZones.List(), remainingNumZones)
-		if err != nil {
-			return nil, err
-		}
-		zones.Insert(nSlice...)
-		return zones.List(), nil
 	}
+
+	allZones := prefSet.Union(remainingZones).List()
+	sort.Strings(allZones)
+	var shiftIndex int
+	if len(prefZones) == 0 {
+		// Randomly shift the requisite zones, since there is no preferred start.
+		shiftIndex = rand.Intn(len(allZones))
+	} else {
+		shiftIndex = slices.Index(allZones, prefZones[0])
+	}
+	shiftedZones := append(allZones[shiftIndex:], allZones[:shiftIndex]...)
+	sortedShiftedReqZones := slices.Filter(nil, shiftedZones, func(v string) bool { return !prefSet.Has(v) })
+	zones := make([]string, 0, numZones)
+	zones = append(zones, prefZones...)
+	zones = append(zones, sortedShiftedReqZones...)
+	return zones[:numZones], nil
 }

 func getZonesFromTopology(topList []*csi.Topology) ([]string, error) {
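The replacement for pickRandAndConsecutive above deserves a worked example. The sketch below (standard library only; the function and zone names are illustrative, not from this commit) mirrors the new ordering: sort the union of zones, rotate it to start at the first preferred zone (the real code picks a random start when there is no preferred zone), then emit the preferred zones first followed by the rotated remainder.

	package main

	import (
		"fmt"
		"sort"
	)

	// rotateAndPick mimics the ordering logic added in this commit. pref comes
	// first; the rest of the sorted union is rotated so it begins at pref[0],
	// which keeps the result deterministic for a given preferred zone.
	func rotateAndPick(pref, remaining []string, numZones int) []string {
		prefSet := map[string]bool{}
		for _, z := range pref {
			prefSet[z] = true
		}
		all := append(append([]string{}, pref...), remaining...)
		sort.Strings(all)

		shift := 0
		if len(pref) > 0 {
			for i, z := range all {
				if z == pref[0] {
					shift = i
					break
				}
			}
		} // with no preferred zones, the real code uses a random shift instead

		rotated := append(all[shift:], all[:shift]...)
		zones := append([]string{}, pref...)
		for _, z := range rotated {
			if !prefSet[z] {
				zones = append(zones, z)
			}
		}
		return zones[:numZones]
	}

	func main() {
		// Hypothetical inputs: one preferred zone, two candidates remaining.
		fmt.Println(rotateAndPick(
			[]string{"us-central1-b"},
			[]string{"us-central1-a", "us-central1-c"},
			2,
		)) // [us-central1-b us-central1-c]
	}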
@@ -1667,11 +1688,11 @@ func getZoneFromSegment(seg map[string]string) (string, error) {
 	return zone, nil
 }

-func pickZones(ctx context.Context, gceCS *GCEControllerServer, top *csi.TopologyRequirement, numZones int, locationTopReq *locationRequirements) ([]string, error) {
+func (gceCS *GCEControllerServer) pickZones(ctx context.Context, top *csi.TopologyRequirement, numZones int, locationTopReq *locationRequirements) ([]string, error) {
 	var zones []string
 	var err error
 	if top != nil {
-		zones, err = pickZonesFromTopology(top, numZones, locationTopReq)
+		zones, err = pickZonesFromTopology(top, numZones, locationTopReq, gceCS.fallbackRequisiteZones)
 		if err != nil {
 			return nil, fmt.Errorf("failed to pick zones from topology: %w", err)
 		}