Skip to content

Commit 21128cc

Browse files
k8s-ci-robotsunnylovestiramisu
authored andcommitted
Merge pull request kubernetes-sigs#2072 from sunnylovestiramisu/attachLimit
Adjust Attach Limits for Gen3 Machines + Add Labels Override
1 parent 4f71e5e commit 21128cc

File tree

5 files changed

+182
-3
lines changed

5 files changed

+182
-3
lines changed

pkg/common/constants.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ const (
5656
// Node label for Data Cache (only applicable to GKE nodes)
5757
NodeLabelPrefix = "cloud.google.com/%s"
5858
DataCacheLssdCountLabel = "gke-data-cache-disk"
59+
// Node label for attach limit override
60+
NodeRestrictionLabelPrefix = "node-restriction.kubernetes.io/%s"
61+
AttachLimitOverrideLabel = "gke-volume-attach-limit-override"
5962
)
6063

6164
// doc https://cloud.google.com/compute/docs/disks/hyperdisks#max-total-disks-per-vm

pkg/common/utils.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"net/http"
2525
"regexp"
2626
"slices"
27+
"strconv"
2728
"strings"
2829
"time"
2930

@@ -772,6 +773,25 @@ func MapNumber(num int64) int64 {
772773
return 0
773774
}
774775

776+
func ExtractCPUFromMachineType(input string) (int64, error) {
777+
// Regex to find the number at the end of the string,
778+
// it allows optional -lssd suffix.
779+
re := regexp.MustCompile(`(\d+)(?:-lssd|-metal)?$`)
780+
781+
match := re.FindStringSubmatch(input)
782+
if len(match) < 2 {
783+
return 0, fmt.Errorf("no number found at the end of the input string: %s", input)
784+
}
785+
786+
numberStr := match[1]
787+
number, err := strconv.ParseInt(numberStr, 10, 64)
788+
if err != nil {
789+
return 0, fmt.Errorf("failed to convert string '%s' to integer: %w", numberStr, err)
790+
}
791+
792+
return number, nil
793+
}
794+
775795
// IsUpdateIopsThroughputValuesAllowed checks if a disk type is hyperdisk,
776796
// which implies that IOPS and throughput values can be updated.
777797
func IsUpdateIopsThroughputValuesAllowed(disk *computev1.Disk) bool {

pkg/common/utils_test.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2149,3 +2149,47 @@ func TestGetMinIopsThroughput(t *testing.T) {
21492149
})
21502150
}
21512151
}
2152+
2153+
func TestExtractCPUFromMachineType(t *testing.T) {
2154+
testcases := []struct {
2155+
name string
2156+
input string
2157+
expectOutput int64
2158+
expectErr bool
2159+
}{
2160+
{
2161+
name: "c3-highmem-176",
2162+
input: "c3-highmem-176",
2163+
expectOutput: 176,
2164+
},
2165+
{
2166+
name: "c3-standard-8-lssd",
2167+
input: "c3-standard-8-lssd",
2168+
expectOutput: 8,
2169+
},
2170+
{
2171+
name: "c3-standard-192-metal",
2172+
input: "c3-standard-192-metal",
2173+
expectOutput: 192,
2174+
},
2175+
{
2176+
name: "invalid input",
2177+
input: "something-not-valid",
2178+
expectOutput: 0,
2179+
expectErr: true,
2180+
},
2181+
}
2182+
2183+
for _, tc := range testcases {
2184+
t.Run(tc.name, func(t *testing.T) {
2185+
output, err := ExtractCPUFromMachineType(tc.input)
2186+
if output != tc.expectOutput {
2187+
t.Errorf("ExtractCPUFromMachineType: got %v, want %v", output, tc.expectOutput)
2188+
}
2189+
2190+
if gotErr := err != nil; gotErr != tc.expectErr {
2191+
t.Fatalf("ExtractCPUFromMachineType(%+v) = %v; expectedErr: %v", tc.input, err, tc.expectErr)
2192+
}
2193+
})
2194+
}
2195+
}

pkg/gce-pd-csi-driver/node.go

Lines changed: 80 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ import (
3131

3232
csi "github.com/container-storage-interface/spec/lib/go/csi"
3333

34+
"k8s.io/client-go/kubernetes"
35+
"k8s.io/client-go/rest"
3436
"k8s.io/klog/v2"
3537
"k8s.io/mount-utils"
3638

@@ -101,7 +103,12 @@ const (
101103
// doc https://cloud.google.com/compute/docs/memory-optimized-machines#x4_disks
102104
x4HyperdiskLimit int64 = 39
103105
// doc https://cloud.google.com/compute/docs/accelerator-optimized-machines#a4-disks
104-
a4HyperdiskLimit int64 = 127
106+
a4HyperdiskLimit int64 = 127
107+
// doc https://cloud.google.com/compute/docs/storage-optimized-machines#z3_disks
108+
// doc https://cloud.google.com/compute/docs/accelerator-optimized-machines#a3-disks
109+
gen3HyperdiskLimit int64 = 31
110+
// doc https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks
111+
h3HyperdiskLimit int64 = 7 // Use limit for Hyperdisk Balanced
105112
defaultLinuxFsType = "ext4"
106113
defaultWindowsFsType = "ntfs"
107114
fsTypeExt3 = "ext3"
@@ -571,7 +578,7 @@ func (ns *GCENodeServer) NodeGetInfo(ctx context.Context, req *csi.NodeGetInfoRe
571578

572579
nodeID := common.CreateNodeID(ns.MetadataService.GetProject(), ns.MetadataService.GetZone(), ns.MetadataService.GetName())
573580

574-
volumeLimits, err := ns.GetVolumeLimits()
581+
volumeLimits, err := ns.GetVolumeLimits(ctx)
575582
if err != nil {
576583
klog.Errorf("GetVolumeLimits failed: %v. The error is ignored so that the driver can register", err.Error())
577584
// No error should be returned from NodeGetInfo, otherwise the driver will not register
@@ -733,7 +740,7 @@ func (ns *GCENodeServer) NodeExpandVolume(ctx context.Context, req *csi.NodeExpa
733740
}, nil
734741
}
735742

736-
func (ns *GCENodeServer) GetVolumeLimits() (int64, error) {
743+
func (ns *GCENodeServer) GetVolumeLimits(ctx context.Context) (int64, error) {
737744
// Machine-type format: n1-type-CPUS or custom-CPUS-RAM or f1/g1-type
738745
machineType := ns.MetadataService.GetMachineType()
739746

@@ -743,6 +750,22 @@ func (ns *GCENodeServer) GetVolumeLimits() (int64, error) {
743750
return volumeLimitSmall, nil
744751
}
745752
}
753+
754+
// Get attach limit override from label
755+
attachLimitOverride, err := GetAttachLimitsOverrideFromNodeLabel(ctx, ns.MetadataService.GetName())
756+
if err == nil && attachLimitOverride > 0 && attachLimitOverride < 128 {
757+
return attachLimitOverride, nil
758+
} else {
759+
// If there is an error or the range is not valid, still proceed to get defaults for the machine type
760+
if err != nil {
761+
klog.Warningf("using default value due to err getting node-restriction.kubernetes.io/gke-volume-attach-limit-override: %v", err)
762+
}
763+
if attachLimitOverride != 0 {
764+
klog.Warningf("using default value due to invalid node-restriction.kubernetes.io/gke-volume-attach-limit-override: %d", attachLimitOverride)
765+
}
766+
}
767+
768+
// Process gen4 machine attach limits
746769
gen4MachineTypesPrefix := []string{"c4a-", "c4-", "n4-"}
747770
for _, gen4Prefix := range gen4MachineTypesPrefix {
748771
if strings.HasPrefix(machineType, gen4Prefix) {
@@ -766,5 +789,59 @@ func (ns *GCENodeServer) GetVolumeLimits() (int64, error) {
766789
}
767790
}
768791

792+
// Process gen3 machine attach limits
793+
gen3MachineTypesPrefix := []string{"c3-", "c3d-"}
794+
for _, gen3Prefix := range gen3MachineTypesPrefix {
795+
if strings.HasPrefix(machineType, gen3Prefix) {
796+
cpus, err := common.ExtractCPUFromMachineType(machineType)
797+
if err != nil {
798+
return volumeLimitSmall, err
799+
}
800+
if cpus <= 8 || strings.Contains(machineType, "metal") {
801+
return volumeLimitSmall, nil
802+
}
803+
return gen3HyperdiskLimit, nil
804+
805+
}
806+
if strings.HasPrefix(machineType, "z3-") {
807+
return gen3HyperdiskLimit, nil
808+
}
809+
if strings.HasPrefix(machineType, "h3-") {
810+
return h3HyperdiskLimit, nil
811+
}
812+
if strings.HasPrefix(machineType, "a3-") {
813+
if machineType == "a3-ultragpu-8g" {
814+
return volumeLimitBig, nil
815+
} else {
816+
return gen3HyperdiskLimit, nil
817+
}
818+
}
819+
820+
}
821+
769822
return volumeLimitBig, nil
770823
}
824+
825+
func GetAttachLimitsOverrideFromNodeLabel(ctx context.Context, nodeName string) (int64, error) {
826+
cfg, err := rest.InClusterConfig()
827+
if err != nil {
828+
return 0, err
829+
}
830+
kubeClient, err := kubernetes.NewForConfig(cfg)
831+
if err != nil {
832+
return 0, err
833+
}
834+
node, err := getNodeWithRetry(ctx, kubeClient, nodeName)
835+
if err != nil {
836+
return 0, err
837+
}
838+
if val, found := node.GetLabels()[fmt.Sprintf(common.NodeRestrictionLabelPrefix, common.AttachLimitOverrideLabel)]; found {
839+
attachLimitOverrideForNode, err := strconv.ParseInt(val, 10, 64)
840+
if err != nil {
841+
return 0, fmt.Errorf("error getting attach limit override from node label: %v", err)
842+
}
843+
klog.V(4).Infof("attach limit override for the node: %v", attachLimitOverrideForNode)
844+
return attachLimitOverrideForNode, nil
845+
}
846+
return 0, nil
847+
}

pkg/gce-pd-csi-driver/node_test.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,41 @@ func TestNodeGetVolumeLimits(t *testing.T) {
313313
machineType: "a4-highgpu-8g",
314314
expVolumeLimit: a4HyperdiskLimit,
315315
},
316+
{
317+
name: "z3-highmem-176",
318+
machineType: "z3-highmem-176",
319+
expVolumeLimit: gen3HyperdiskLimit,
320+
},
321+
{
322+
name: "h3-standard-88",
323+
machineType: "h3-standard-88",
324+
expVolumeLimit: h3HyperdiskLimit,
325+
},
326+
{
327+
name: "a3-ultragpu-8g",
328+
machineType: "a3-ultragpu-8g",
329+
expVolumeLimit: volumeLimitBig,
330+
},
331+
{
332+
name: "a3-megagpu-8g",
333+
machineType: "a3-megagpu-8g",
334+
expVolumeLimit: gen3HyperdiskLimit, // 31
335+
},
336+
{
337+
name: "c3d-highmem-8-lssd",
338+
machineType: "c3d-highmem-8-lssd",
339+
expVolumeLimit: volumeLimitSmall, // 15
340+
},
341+
{
342+
name: "c3-standard-192-metal",
343+
machineType: "c3-standard-192-metal",
344+
expVolumeLimit: volumeLimitSmall, // 15
345+
},
346+
{
347+
name: "c3-standard-176",
348+
machineType: "c3-standard-176",
349+
expVolumeLimit: gen3HyperdiskLimit, // 31
350+
},
316351
}
317352

318353
for _, tc := range testCases {

0 commit comments

Comments
 (0)