Skip to content

Commit 617666c

Browse files
committed
Add --fallback-requisite-zones command line argument
1 parent 1a1f846 commit 617666c

File tree

6 files changed

+218
-115
lines changed

6 files changed

+218
-115
lines changed

cmd/gce-pd-csi-driver/main.go

+7-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"math/rand"
2222
"os"
2323
"runtime"
24+
"strings"
2425
"time"
2526

2627
"k8s.io/klog/v2"
@@ -66,6 +67,8 @@ var (
6667
maxConcurrentFormatAndMount = flag.Int("max-concurrent-format-and-mount", 1, "If set then format and mount operations are serialized on each node. This is stronger than max-concurrent-format as it includes fsck and other mount operations")
6768
formatAndMountTimeout = flag.Duration("format-and-mount-timeout", 1*time.Minute, "The maximum duration of a format and mount operation before another such operation will be started. Used only if --serialize-format-and-mount")
6869

70+
fallbackRequisiteZonesFlag = flag.String("fallback-requisite-zones", "", "Comma separated list of requisite zones that will be used if there are not sufficient zones present in requisite topologies when provisioning a disk")
71+
6972
version string
7073
)
7174

@@ -128,6 +131,9 @@ func handle() {
128131
// Initialize identity server
129132
identityServer := driver.NewIdentityServer(gceDriver)
130133

134+
// Initialize requisite zones
135+
fallbackRequisiteZones := strings.Split(*fallbackRequisiteZonesFlag, ",")
136+
131137
// Initialize requirements for the controller service
132138
var controllerServer *driver.GCEControllerServer
133139
if *runControllerService {
@@ -137,7 +143,7 @@ func handle() {
137143
}
138144
initialBackoffDuration := time.Duration(*errorBackoffInitialDurationMs) * time.Millisecond
139145
maxBackoffDuration := time.Duration(*errorBackoffMaxDurationMs) * time.Millisecond
140-
controllerServer = driver.NewControllerServer(gceDriver, cloudProvider, initialBackoffDuration, maxBackoffDuration)
146+
controllerServer = driver.NewControllerServer(gceDriver, cloudProvider, initialBackoffDuration, maxBackoffDuration, fallbackRequisiteZones)
141147
} else if *cloudConfigFilePath != "" {
142148
klog.Warningf("controller service is disabled but cloud config given - it has no effect")
143149
}

pkg/gce-pd-csi-driver/controller.go

+43-24
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,15 @@ type GCEControllerServer struct {
9191
// publish/unpublish call will clear the backoff condition for a node and
9292
// disk.
9393
errorBackoff *csiErrorBackoff
94+
95+
// Requisite zones to fallback to when provisioning a disk.
96+
// If there are an insufficient number of zones available in the union
97+
// of preferred/requisite topology, this list is used instead of
98+
// the passed in requisite topology.
99+
// The main use case of this field is to support Regional Persistent Disk
100+
// provisioning in GKE Autopilot, where a GKE cluster can
101+
// be scaled down to 1 zone.
102+
fallbackRequisiteZones []string
94103
}
95104

96105
type csiErrorBackoffId string
@@ -272,7 +281,7 @@ func (gceCS *GCEControllerServer) CreateVolume(ctx context.Context, req *csi.Cre
272281
var volKey *meta.Key
273282
switch params.ReplicationType {
274283
case replicationTypeNone:
275-
zones, err = pickZones(ctx, gceCS, req.GetAccessibilityRequirements(), 1, locationTopReq)
284+
zones, err = gceCS.pickZones(ctx, req.GetAccessibilityRequirements(), 1, locationTopReq)
276285
if err != nil {
277286
return nil, status.Errorf(codes.InvalidArgument, "CreateVolume failed to pick zones for disk: %v", err.Error())
278287
}
@@ -282,7 +291,7 @@ func (gceCS *GCEControllerServer) CreateVolume(ctx context.Context, req *csi.Cre
282291
volKey = meta.ZonalKey(name, zones[0])
283292

284293
case replicationTypeRegionalPD:
285-
zones, err = pickZones(ctx, gceCS, req.GetAccessibilityRequirements(), 2, locationTopReq)
294+
zones, err = gceCS.pickZones(ctx, req.GetAccessibilityRequirements(), 2, locationTopReq)
286295
if err != nil {
287296
return nil, status.Errorf(codes.InvalidArgument, "CreateVolume failed to pick zones for disk: %v", err.Error())
288297
}
@@ -1551,7 +1560,7 @@ func prependZone(zone string, zones []string) []string {
15511560
return newZones
15521561
}
15531562

1554-
func pickZonesFromTopology(top *csi.TopologyRequirement, numZones int, locationTopReq *locationRequirements) ([]string, error) {
1563+
func pickZonesFromTopology(top *csi.TopologyRequirement, numZones int, locationTopReq *locationRequirements, fallbackRequisiteZones []string) ([]string, error) {
15551564
reqZones, err := getZonesFromTopology(top.GetRequisite())
15561565
if err != nil {
15571566
return nil, fmt.Errorf("could not get zones from requisite topology: %w", err)
@@ -1596,27 +1605,37 @@ func pickZonesFromTopology(top *csi.TopologyRequirement, numZones int, locationT
15961605

15971606
if numZones <= len(prefZones) {
15981607
return prefZones[0:numZones], nil
1599-
} else {
1600-
zones := sets.String{}
1601-
// Add all preferred zones into zones
1602-
zones.Insert(prefZones...)
1603-
remainingNumZones := numZones - len(prefZones)
1604-
// Take all of the remaining zones from requisite zones
1605-
reqSet := sets.NewString(reqZones...)
1606-
prefSet := sets.NewString(prefZones...)
1607-
remainingZones := reqSet.Difference(prefSet)
1608-
1609-
if remainingZones.Len() < remainingNumZones {
1610-
return nil, fmt.Errorf("need %v zones from topology, only got %v unique zones", numZones, reqSet.Union(prefSet).Len())
1611-
}
1612-
// Add the remaining number of zones into the set
1613-
nSlice, err := pickRandAndConsecutive(remainingZones.List(), remainingNumZones)
1614-
if err != nil {
1615-
return nil, err
1608+
}
1609+
1610+
remainingNumZones := numZones - len(prefZones)
1611+
// Take all of the remaining zones from requisite zones
1612+
reqSet := sets.NewString(reqZones...)
1613+
prefSet := sets.NewString(prefZones...)
1614+
remainingZones := reqSet.Difference(prefSet)
1615+
1616+
fallbackSet := sets.NewString(fallbackRequisiteZones...)
1617+
remainingFallbackZones := fallbackSet.Difference(prefSet)
1618+
if remainingZones.Len() < remainingNumZones {
1619+
if remainingFallbackZones.Len() >= remainingNumZones {
1620+
remainingZones = remainingFallbackZones
1621+
} else {
1622+
return nil, fmt.Errorf("need %v zones from topology, only got %v unique zones", numZones, remainingZones.Union(prefSet).Len())
16161623
}
1617-
zones.Insert(nSlice...)
1618-
return zones.List(), nil
16191624
}
1625+
1626+
allZones := prefSet.Union(remainingZones).List()
1627+
sort.Strings(allZones)
1628+
var shiftIndex int
1629+
if len(prefZones) == 0 {
1630+
shiftIndex = rand.Intn(len(allZones))
1631+
} else {
1632+
shiftIndex = slices.Index(allZones, prefZones[0])
1633+
}
1634+
shiftedZones := append(allZones[shiftIndex:], allZones[:shiftIndex]...)
1635+
zones := make([]string, 0, numZones)
1636+
zones = append(zones, prefZones...)
1637+
zones = slices.Filter(zones, shiftedZones, func(v string) bool { return !prefSet.Has(v) })
1638+
return zones[:numZones], nil
16201639
}
16211640

16221641
func getZonesFromTopology(topList []*csi.Topology) ([]string, error) {
@@ -1652,11 +1671,11 @@ func getZoneFromSegment(seg map[string]string) (string, error) {
16521671
return zone, nil
16531672
}
16541673

1655-
func pickZones(ctx context.Context, gceCS *GCEControllerServer, top *csi.TopologyRequirement, numZones int, locationTopReq *locationRequirements) ([]string, error) {
1674+
func (gceCS *GCEControllerServer) pickZones(ctx context.Context, top *csi.TopologyRequirement, numZones int, locationTopReq *locationRequirements) ([]string, error) {
16561675
var zones []string
16571676
var err error
16581677
if top != nil {
1659-
zones, err = pickZonesFromTopology(top, numZones, locationTopReq)
1678+
zones, err = pickZonesFromTopology(top, numZones, locationTopReq, gceCS.fallbackRequisiteZones)
16601679
if err != nil {
16611680
return nil, fmt.Errorf("failed to pick zones from topology: %w", err)
16621681
}

0 commit comments

Comments
 (0)