Skip to content

Commit e9f4c48

Browse files
Adjust Attach Limits for Gen3 Machines + Add Labels Override
1 parent a89f221 commit e9f4c48

File tree

5 files changed

+182
-3
lines changed

5 files changed

+182
-3
lines changed

pkg/common/constants.go

+3
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ const (
6060
// Node label for Data Cache (only applicable to GKE nodes)
6161
NodeLabelPrefix = "cloud.google.com/%s"
6262
DataCacheLssdCountLabel = "gke-data-cache-disk"
63+
// Node label for attach limit override
64+
NodeRestrictionLabelPrefix = "node-restriction.kubernetes.io/%s"
65+
AttachLimitOverrideLabel = "gke-volume-attach-limit-override"
6366
)
6467

6568
// doc https://cloud.google.com/compute/docs/disks/hyperdisks#max-total-disks-per-vm

pkg/common/utils.go

+20
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"net/http"
2525
"regexp"
2626
"slices"
27+
"strconv"
2728
"strings"
2829
"time"
2930

@@ -772,6 +773,25 @@ func MapNumber(num int64) int64 {
772773
return 0
773774
}
774775

776+
// machineTypeCPURegex matches the last number of a machine type name,
// allowing an optional "-lssd" or "-metal" suffix after it. It is compiled
// once at package initialization rather than on every call.
var machineTypeCPURegex = regexp.MustCompile(`(\d+)(?:-lssd|-metal)?$`)

// ExtractCPUFromMachineType returns the number that terminates a machine type
// name such as "c3-highmem-176", "c3-standard-8-lssd" or
// "c3-standard-192-metal" (176, 8 and 192 respectively).
//
// Only the trailing number is inspected: for name formats that do not end
// with the CPU count (for example custom-CPUS-RAM) the returned value is the
// last number in the name, not necessarily the CPU count.
//
// An error is returned, together with 0, when the name does not end with a
// number (optionally followed by "-lssd" or "-metal"), or when that number
// does not fit in an int64.
func ExtractCPUFromMachineType(input string) (int64, error) {
	match := machineTypeCPURegex.FindStringSubmatch(input)
	if len(match) < 2 {
		return 0, fmt.Errorf("no number found at the end of the input string: %s", input)
	}

	// match[1] is the captured digit run; the optional suffix is not captured.
	numberStr := match[1]
	number, err := strconv.ParseInt(numberStr, 10, 64)
	if err != nil {
		return 0, fmt.Errorf("failed to convert string '%s' to integer: %w", numberStr, err)
	}

	return number, nil
}
794+
775795
func DiskTypeLabelKey(diskType string) string {
776796
return fmt.Sprintf("%s/%s", DiskTypeKeyPrefix, diskType)
777797
}

pkg/common/utils_test.go

+44
Original file line numberDiff line numberDiff line change
@@ -2149,3 +2149,47 @@ func TestGetMinIopsThroughput(t *testing.T) {
21492149
})
21502150
}
21512151
}
2152+
2153+
func TestExtractCPUFromMachineType(t *testing.T) {
2154+
testcases := []struct {
2155+
name string
2156+
input string
2157+
expectOutput int64
2158+
expectErr bool
2159+
}{
2160+
{
2161+
name: "c3-highmem-176",
2162+
input: "c3-highmem-176",
2163+
expectOutput: 176,
2164+
},
2165+
{
2166+
name: "c3-standard-8-lssd",
2167+
input: "c3-standard-8-lssd",
2168+
expectOutput: 8,
2169+
},
2170+
{
2171+
name: "c3-standard-192-metal",
2172+
input: "c3-standard-192-metal",
2173+
expectOutput: 192,
2174+
},
2175+
{
2176+
name: "invalid input",
2177+
input: "something-not-valid",
2178+
expectOutput: 0,
2179+
expectErr: true,
2180+
},
2181+
}
2182+
2183+
for _, tc := range testcases {
2184+
t.Run(tc.name, func(t *testing.T) {
2185+
output, err := ExtractCPUFromMachineType(tc.input)
2186+
if output != tc.expectOutput {
2187+
t.Errorf("ExtractCPUFromMachineType: got %v, want %v", output, tc.expectOutput)
2188+
}
2189+
2190+
if gotErr := err != nil; gotErr != tc.expectErr {
2191+
t.Fatalf("ExtractCPUFromMachineType(%+v) = %v; expectedErr: %v", tc.input, err, tc.expectErr)
2192+
}
2193+
})
2194+
}
2195+
}

pkg/gce-pd-csi-driver/node.go

+80-3
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ import (
3131

3232
csi "github.com/container-storage-interface/spec/lib/go/csi"
3333

34+
"k8s.io/client-go/kubernetes"
35+
"k8s.io/client-go/rest"
3436
"k8s.io/klog/v2"
3537
"k8s.io/mount-utils"
3638

@@ -101,7 +103,12 @@ const (
101103
// doc https://cloud.google.com/compute/docs/memory-optimized-machines#x4_disks
102104
x4HyperdiskLimit int64 = 39
103105
// doc https://cloud.google.com/compute/docs/accelerator-optimized-machines#a4-disks
104-
a4HyperdiskLimit int64 = 127
106+
a4HyperdiskLimit int64 = 127
107+
// doc https://cloud.google.com/compute/docs/storage-optimized-machines#z3_disks
108+
// doc https://cloud.google.com/compute/docs/accelerator-optimized-machines#a3-disks
109+
gen3HyperdiskLimit int64 = 31
110+
// doc https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks
111+
h3HyperdiskLimit int64 = 7 // Use limit for Hyperdisk Balanced
105112
defaultLinuxFsType = "ext4"
106113
defaultWindowsFsType = "ntfs"
107114
fsTypeExt3 = "ext3"
@@ -571,7 +578,7 @@ func (ns *GCENodeServer) NodeGetInfo(ctx context.Context, req *csi.NodeGetInfoRe
571578

572579
nodeID := common.CreateNodeID(ns.MetadataService.GetProject(), ns.MetadataService.GetZone(), ns.MetadataService.GetName())
573580

574-
volumeLimits, err := ns.GetVolumeLimits()
581+
volumeLimits, err := ns.GetVolumeLimits(ctx)
575582
if err != nil {
576583
klog.Errorf("GetVolumeLimits failed: %v", err.Error())
577584
}
@@ -731,7 +738,7 @@ func (ns *GCENodeServer) NodeExpandVolume(ctx context.Context, req *csi.NodeExpa
731738
}, nil
732739
}
733740

734-
func (ns *GCENodeServer) GetVolumeLimits() (int64, error) {
741+
func (ns *GCENodeServer) GetVolumeLimits(ctx context.Context) (int64, error) {
735742
// Machine-type format: n1-type-CPUS or custom-CPUS-RAM or f1/g1-type
736743
machineType := ns.MetadataService.GetMachineType()
737744

@@ -741,6 +748,22 @@ func (ns *GCENodeServer) GetVolumeLimits() (int64, error) {
741748
return volumeLimitSmall, nil
742749
}
743750
}
751+
752+
// Get attach limit override from label
753+
attachLimitOverride, err := GetAttachLimitsOverrideFromNodeLabel(ctx, ns.MetadataService.GetName())
754+
if err == nil && attachLimitOverride > 0 && attachLimitOverride < 128 {
755+
return attachLimitOverride, nil
756+
} else {
757+
// If there is an error or the range is not valid, still proceed to get defaults for the machine type
758+
if err != nil {
759+
klog.Warningf("using default value due to err getting node-restriction.kubernetes.io/gke-volume-attach-limit-override: %v", err)
760+
}
761+
if attachLimitOverride != 0 {
762+
klog.Warningf("using default value due to invalid node-restriction.kubernetes.io/gke-volume-attach-limit-override: %d", attachLimitOverride)
763+
}
764+
}
765+
766+
// Process gen4 machine attach limits
744767
gen4MachineTypesPrefix := []string{"c4a-", "c4-", "n4-"}
745768
for _, gen4Prefix := range gen4MachineTypesPrefix {
746769
if strings.HasPrefix(machineType, gen4Prefix) {
@@ -760,5 +783,59 @@ func (ns *GCENodeServer) GetVolumeLimits() (int64, error) {
760783
}
761784
}
762785

786+
// Process gen3 machine attach limits
787+
gen3MachineTypesPrefix := []string{"c3-", "c3d-"}
788+
for _, gen3Prefix := range gen3MachineTypesPrefix {
789+
if strings.HasPrefix(machineType, gen3Prefix) {
790+
cpus, err := common.ExtractCPUFromMachineType(machineType)
791+
if err != nil {
792+
return volumeLimitSmall, err
793+
}
794+
if cpus <= 8 || strings.Contains(machineType, "metal") {
795+
return volumeLimitSmall, nil
796+
}
797+
return gen3HyperdiskLimit, nil
798+
799+
}
800+
if strings.HasPrefix(machineType, "z3-") {
801+
return gen3HyperdiskLimit, nil
802+
}
803+
if strings.HasPrefix(machineType, "h3-") {
804+
return h3HyperdiskLimit, nil
805+
}
806+
if strings.HasPrefix(machineType, "a3-") {
807+
if machineType == "a3-ultragpu-8g" {
808+
return volumeLimitBig, nil
809+
} else {
810+
return gen3HyperdiskLimit, nil
811+
}
812+
}
813+
814+
}
815+
763816
return volumeLimitBig, nil
764817
}
818+
819+
func GetAttachLimitsOverrideFromNodeLabel(ctx context.Context, nodeName string) (int64, error) {
820+
cfg, err := rest.InClusterConfig()
821+
if err != nil {
822+
return 0, err
823+
}
824+
kubeClient, err := kubernetes.NewForConfig(cfg)
825+
if err != nil {
826+
return 0, err
827+
}
828+
node, err := getNodeWithRetry(ctx, kubeClient, nodeName)
829+
if err != nil {
830+
return 0, err
831+
}
832+
if val, found := node.GetLabels()[fmt.Sprintf(common.NodeRestrictionLabelPrefix, common.AttachLimitOverrideLabel)]; found {
833+
attachLimitOverrideForNode, err := strconv.ParseInt(val, 10, 64)
834+
if err != nil {
835+
return 0, fmt.Errorf("error getting attach limit override from node label: %v", err)
836+
}
837+
klog.V(4).Infof("attach limit override for the node: %v", attachLimitOverrideForNode)
838+
return attachLimitOverrideForNode, nil
839+
}
840+
return 0, nil
841+
}

pkg/gce-pd-csi-driver/node_test.go

+35
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,41 @@ func TestNodeGetVolumeLimits(t *testing.T) {
309309
machineType: "a4-highgpu-8g",
310310
expVolumeLimit: a4HyperdiskLimit,
311311
},
312+
{
313+
name: "z3-highmem-176",
314+
machineType: "z3-highmem-176",
315+
expVolumeLimit: gen3HyperdiskLimit,
316+
},
317+
{
318+
name: "h3-standard-88",
319+
machineType: "h3-standard-88",
320+
expVolumeLimit: h3HyperdiskLimit,
321+
},
322+
{
323+
name: "a3-ultragpu-8g",
324+
machineType: "a3-ultragpu-8g",
325+
expVolumeLimit: volumeLimitBig,
326+
},
327+
{
328+
name: "a3-megagpu-8g",
329+
machineType: "a3-megagpu-8g",
330+
expVolumeLimit: gen3HyperdiskLimit, // 31
331+
},
332+
{
333+
name: "c3d-highmem-8-lssd",
334+
machineType: "c3d-highmem-8-lssd",
335+
expVolumeLimit: volumeLimitSmall, // 15
336+
},
337+
{
338+
name: "c3-standard-192-metal",
339+
machineType: "c3-standard-192-metal",
340+
expVolumeLimit: volumeLimitSmall, // 15
341+
},
342+
{
343+
name: "c3-standard-176",
344+
machineType: "c3-standard-176",
345+
expVolumeLimit: gen3HyperdiskLimit, // 31
346+
},
312347
}
313348

314349
for _, tc := range testCases {

0 commit comments

Comments
 (0)