Skip to content

Commit 8e820cd

Browse files
authored
Merge pull request kubernetes-sigs#2091 from motiejus/public_btrfs-reclaim
btrfs reclaim on kernel v5.19+: use bg_reclaim_threshold
2 parents 68021f0 + 7f44bae commit 8e820cd

File tree

6 files changed

+237
-12
lines changed

6 files changed

+237
-12
lines changed

README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,13 @@ Controller-level and node-level deployments will both have priorityClassName set
9696

9797
As noted in [GCP PD documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/gce-pd-csi-driver), `ext4` and `xfs` are officially supported. `btrfs` support is experimental:
9898
- As of writing, Ubuntu VM images support btrfs, but [COS does not](https://cloud.google.com/container-optimized-os/docs/concepts/supported-filesystems).
99-
- Early testers have observed CSI driver OOMs when mounting larger (1TiB+) btrfs volumes under default memory constraints. The default constraint, as of writing, is 50MiB.
99+
100+
The `btrfs` filesystem accepts two "special" mount options, which are consumed by the driver instead of being passed to `mount`:
101+
102+
- `btrfs-allocation-data-bg_reclaim_threshold=<value>` (`<value>` is an integer from `0` to `99`)
103+
- `btrfs-allocation-metadata-bg_reclaim_threshold=<value>` (`<value>` is an integer from `0` to `99`)
104+
105+
After the volume is staged, the driver writes each option's value to the corresponding file `/sys/fs/btrfs/FS-UUID/allocation/{,meta}data/bg_reclaim_threshold`, as documented [in btrfs docs](https://btrfs.readthedocs.io/en/latest/ch-sysfs.html#uuid-allocations-data-metadata-system).
100106

101107
## Further Documentation
102108

cmd/gce-pd-csi-driver/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ func handle() {
265265
DeviceInUseTimeout: *deviceInUseTimeout,
266266
EnableDataCache: *enableDataCacheFlag,
267267
DataCacheEnabledNodePool: isDataCacheEnabledNodePool,
268+
SysfsPath: "/sys",
268269
}
269270
nodeServer = driver.NewNodeServer(gceDriver, mounter, deviceUtils, meta, statter, nsArgs)
270271

pkg/gce-pd-csi-driver/gce-pd-driver.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ func NewNodeServer(gceDriver *GCEDriver, mounter *mount.SafeFormatAndMount, devi
157157
deviceInUseErrors: newDeviceErrMap(args.DeviceInUseTimeout),
158158
EnableDataCache: args.EnableDataCache,
159159
DataCacheEnabledNodePool: args.DataCacheEnabledNodePool,
160+
SysfsPath: args.SysfsPath,
160161
}
161162
}
162163

pkg/gce-pd-csi-driver/node.go

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ limitations under the License.
1515
package gceGCEDriver
1616

1717
import (
18+
"bytes"
1819
"context"
1920
"errors"
2021
"fmt"
@@ -51,6 +52,7 @@ type GCENodeServer struct {
5152
MetadataService metadataservice.MetadataService
5253
EnableDataCache bool
5354
DataCacheEnabledNodePool bool
55+
SysfsPath string
5456

5557
// A map storing all volumes with ongoing operations so that additional operations
5658
// for that same volume (as defined by VolumeID) return an Aborted error
@@ -87,6 +89,9 @@ type NodeServerArgs struct {
8789
EnableDataCache bool
8890

8991
DataCacheEnabledNodePool bool
92+
93+
// SysfsPath defaults to "/sys", except if it's a unit test.
94+
SysfsPath string
9095
}
9196

9297
var _ csi.NodeServer = &GCENodeServer{}
@@ -112,12 +117,17 @@ const (
112117
defaultLinuxFsType = "ext4"
113118
defaultWindowsFsType = "ntfs"
114119
fsTypeExt3 = "ext3"
120+
fsTypeBtrfs = "btrfs"
115121

116122
readAheadKBMountFlagRegexPattern = "^read_ahead_kb=(.+)$"
123+
btrfsReclaimDataRegexPattern = "^btrfs-allocation-data-bg_reclaim_threshold=(\\d{1,2})$" // 0-99 are valid, incl. 00
124+
btrfsReclaimMetadataRegexPattern = "^btrfs-allocation-metadata-bg_reclaim_threshold=(\\d{1,2})$" // ditto ^
117125
)
118126

119127
var (
120128
readAheadKBMountFlagRegex = regexp.MustCompile(readAheadKBMountFlagRegexPattern)
129+
btrfsReclaimDataRegex = regexp.MustCompile(btrfsReclaimDataRegexPattern)
130+
btrfsReclaimMetadataRegex = regexp.MustCompile(btrfsReclaimMetadataRegexPattern)
121131
)
122132

123133
func getDefaultFsType() string {
@@ -390,6 +400,7 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage
390400
// Part 3: Mount device to stagingTargetPath
391401
fstype := getDefaultFsType()
392402

403+
var btrfsReclaimData, btrfsReclaimMetadata string
393404
shouldUpdateReadAhead := false
394405
var readAheadKB int64
395406
options := []string{}
@@ -403,6 +414,10 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage
403414
if err != nil {
404415
return nil, status.Errorf(codes.InvalidArgument, "failure parsing mount flags: %v", err.Error())
405416
}
417+
418+
if mnt.FsType == fsTypeBtrfs {
419+
btrfsReclaimData, btrfsReclaimMetadata = extractBtrfsReclaimFlags(mnt.MountFlags)
420+
}
406421
} else if blk := volumeCapability.GetBlock(); blk != nil {
407422
// Noop for Block NodeStageVolume
408423
klog.V(4).Infof("NodeStageVolume succeeded on %v to %s, capability is block so this is a no-op", volumeID, stagingTargetPath)
@@ -454,10 +469,64 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage
454469
}
455470
}
456471

472+
// Part 6: if configured, write sysfs values
473+
if !readonly {
474+
sysfs := map[string]string{}
475+
if btrfsReclaimData != "" {
476+
sysfs["allocation/data/bg_reclaim_threshold"] = btrfsReclaimData
477+
}
478+
if btrfsReclaimMetadata != "" {
479+
sysfs["allocation/metadata/bg_reclaim_threshold"] = btrfsReclaimMetadata
480+
}
481+
482+
if len(sysfs) > 0 {
483+
args := []string{"--match-tag", "UUID", "--output", "value", stagingTargetPath}
484+
cmd := ns.Mounter.Exec.Command("blkid", args...)
485+
var stderr bytes.Buffer
486+
cmd.SetStderr(&stderr)
487+
klog.V(4).Infof(
488+
"running %q for volume %s",
489+
strings.Join(append([]string{"blkid"}, args...), " "),
490+
volumeID,
491+
)
492+
uuid, err := cmd.Output()
493+
if err != nil {
494+
klog.Errorf("blkid failed for %s. stderr:\n%s", volumeID, stderr.String())
495+
return nil, status.Errorf(codes.Internal, "blkid failed: %v", err)
496+
}
497+
uuid = bytes.TrimRight(uuid, "\n")
498+
499+
for key, value := range sysfs {
500+
path := fmt.Sprintf("%s/fs/btrfs/%s/%s", ns.SysfsPath, uuid, key)
501+
if err := writeSysfs(path, value); err != nil {
502+
return nil, status.Error(codes.Internal, err.Error())
503+
}
504+
klog.V(4).Infof("NodeStageVolume set %s %s=%s", volumeID, key, value)
505+
}
506+
}
507+
}
508+
457509
klog.V(4).Infof("NodeStageVolume succeeded on %v to %s", volumeID, stagingTargetPath)
458510
return &csi.NodeStageVolumeResponse{}, nil
459511
}
460512

513+
// writeSysfs writes value to the already-existing sysfs attribute file at path.
//
// The file is opened write-only and without O_CREATE, so a missing attribute
// (for example, a kernel that does not expose it) is reported as an error
// rather than silently creating a regular file.
//
// The returned error wraps the underlying failure and names the value that
// was being written, because the *os.PathError produced by the os package
// already carries the path but not the rejected value. An error from closing
// the file is joined with any earlier error instead of being dropped.
func writeSysfs(path, value string) (err error) {
	f, err := os.OpenFile(path, os.O_WRONLY|os.O_TRUNC, 0o644)
	if err != nil {
		return fmt.Errorf("opening sysfs attribute to write %q: %w", value, err)
	}

	// The named result lets the deferred close report its own failure
	// without masking an earlier write error.
	defer func() {
		if closeErr := f.Close(); closeErr != nil {
			err = errors.Join(err, fmt.Errorf("closing sysfs attribute after writing %q: %w", value, closeErr))
		}
	}()

	if _, writeErr := f.WriteString(value); writeErr != nil {
		return fmt.Errorf("writing %q to sysfs attribute: %w", value, writeErr)
	}

	return nil
}
529+
461530
func (ns *GCENodeServer) updateReadAhead(devicePath string, readAheadKB int64) error {
462531
isBlock, err := ns.VolumeStatter.IsBlockDevice(devicePath)
463532
if err != nil {
@@ -474,6 +543,18 @@ func (ns *GCENodeServer) updateReadAhead(devicePath string, readAheadKB int64) e
474543
return nil
475544
}
476545

546+
func extractBtrfsReclaimFlags(mountFlags []string) (string, string) {
547+
var reclaimData, reclaimMetadata string
548+
for _, mountFlag := range mountFlags {
549+
if got := btrfsReclaimDataRegex.FindStringSubmatch(mountFlag); len(got) == 2 {
550+
reclaimData = got[1]
551+
} else if got := btrfsReclaimMetadataRegex.FindStringSubmatch(mountFlag); len(got) == 2 {
552+
reclaimMetadata = got[1]
553+
}
554+
}
555+
return reclaimData, reclaimMetadata
556+
}
557+
477558
func extractReadAheadKBMountFlag(mountFlags []string) (int64, bool, error) {
478559
for _, mountFlag := range mountFlags {
479560
if readAheadKB := readAheadKBMountFlagRegex.FindStringSubmatch(mountFlag); len(readAheadKB) == 2 {

pkg/gce-pd-csi-driver/node_test.go

Lines changed: 139 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -537,18 +537,34 @@ func TestNodeStageVolume(t *testing.T) {
537537
defer os.RemoveAll(tempDir)
538538
stagingPath := filepath.Join(tempDir, defaultStagingPath)
539539

540+
btrfsUUID := "00000000-0000-0000-0000-000000000001"
541+
btrfsPrefix := fmt.Sprintf("%s/sys/fs/btrfs/%s/allocation", tempDir, btrfsUUID)
542+
543+
for _, suffix := range []string{"data", "metadata"} {
544+
dir := btrfsPrefix + "/" + suffix
545+
if err := os.MkdirAll(dir, 0755); err != nil {
546+
t.Fatalf("Failed to set up fake sysfs dir %q: %v", dir, err)
547+
}
548+
fname := dir + "/bg_reclaim_threshold"
549+
if err := os.WriteFile(fname, []byte("0\n"), 0644); err != nil {
550+
t.Fatalf("write %q: %v", fname, err)
551+
}
552+
}
553+
540554
testCases := []struct {
541-
name string
542-
req *csi.NodeStageVolumeRequest
543-
deviceSize int
544-
blockExtSize int
545-
readonlyBit string
546-
expResize bool
547-
expReadAheadUpdate bool
548-
expReadAheadKB string
549-
readAheadSectors string
550-
sectorSizeInBytes int
551-
expErrCode codes.Code
555+
name string
556+
req *csi.NodeStageVolumeRequest
557+
deviceSize int
558+
blockExtSize int
559+
readonlyBit string
560+
expResize bool
561+
expReadAheadUpdate bool
562+
expReadAheadKB string
563+
readAheadSectors string
564+
btrfsReclaimData string
565+
btrfsReclaimMetadata string
566+
sectorSizeInBytes int
567+
expErrCode codes.Code
552568
}{
553569
{
554570
name: "Valid request, no resize because block and filesystem sizes match",
@@ -598,6 +614,76 @@ func TestNodeStageVolume(t *testing.T) {
598614
readonlyBit: "0",
599615
expResize: false,
600616
},
617+
{
618+
name: "btrfs-allocation-data-bg_reclaim_threshold is ignored on non-btrfs",
619+
req: &csi.NodeStageVolumeRequest{
620+
VolumeId: volumeID,
621+
StagingTargetPath: stagingPath,
622+
VolumeCapability: &csi.VolumeCapability{
623+
AccessType: &csi.VolumeCapability_Mount{
624+
Mount: &csi.VolumeCapability_MountVolume{
625+
FsType: "ext4",
626+
MountFlags: []string{"btrfs-allocation-data-bg_reclaim_threshold=90"},
627+
},
628+
},
629+
AccessMode: &csi.VolumeCapability_AccessMode{
630+
Mode: csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
631+
},
632+
},
633+
},
634+
deviceSize: 1,
635+
blockExtSize: 1,
636+
readonlyBit: "0",
637+
btrfsReclaimData: "0",
638+
},
639+
{
640+
name: "Valid request, set btrfs-allocation-data-bg_reclaim_threshold=90",
641+
req: &csi.NodeStageVolumeRequest{
642+
VolumeId: volumeID,
643+
StagingTargetPath: stagingPath,
644+
VolumeCapability: &csi.VolumeCapability{
645+
AccessType: &csi.VolumeCapability_Mount{
646+
Mount: &csi.VolumeCapability_MountVolume{
647+
FsType: "btrfs",
648+
MountFlags: []string{"btrfs-allocation-data-bg_reclaim_threshold=90"},
649+
},
650+
},
651+
AccessMode: &csi.VolumeCapability_AccessMode{
652+
Mode: csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
653+
},
654+
},
655+
},
656+
deviceSize: 1,
657+
blockExtSize: 1,
658+
readonlyBit: "0",
659+
btrfsReclaimData: "90",
660+
},
661+
{
662+
name: "Valid request, set btrfs-allocation-{,meta}data-bg_reclaim_threshold",
663+
req: &csi.NodeStageVolumeRequest{
664+
VolumeId: volumeID,
665+
StagingTargetPath: stagingPath,
666+
VolumeCapability: &csi.VolumeCapability{
667+
AccessType: &csi.VolumeCapability_Mount{
668+
Mount: &csi.VolumeCapability_MountVolume{
669+
FsType: "btrfs",
670+
MountFlags: []string{
671+
"btrfs-allocation-data-bg_reclaim_threshold=90",
672+
"btrfs-allocation-metadata-bg_reclaim_threshold=91",
673+
},
674+
},
675+
},
676+
AccessMode: &csi.VolumeCapability_AccessMode{
677+
Mode: csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
678+
},
679+
},
680+
},
681+
deviceSize: 1,
682+
blockExtSize: 1,
683+
readonlyBit: "0",
684+
btrfsReclaimData: "90",
685+
btrfsReclaimMetadata: "91",
686+
},
601687
{
602688
name: "Valid request, update readahead",
603689
req: &csi.NodeStageVolumeRequest{
@@ -730,6 +816,7 @@ func TestNodeStageVolume(t *testing.T) {
730816
t.Logf("Test case: %s", tc.name)
731817
resizeCalled := false
732818
readAheadUpdateCalled := false
819+
blkidCalled := false
733820
actionList := []testingexec.FakeCommandAction{
734821
makeFakeCmd(
735822
&testingexec.FakeCmd{
@@ -853,9 +940,26 @@ func TestNodeStageVolume(t *testing.T) {
853940
),
854941
}...)
855942
}
943+
if tc.btrfsReclaimData != "" || tc.btrfsReclaimMetadata != "" {
944+
actionList = append(actionList, []testingexec.FakeCommandAction{
945+
makeFakeCmd(
946+
&testingexec.FakeCmd{
947+
OutputScript: []testingexec.FakeAction{
948+
func() ([]byte, []byte, error) {
949+
blkidCalled = true
950+
return []byte(btrfsUUID + "\n"), nil, nil
951+
},
952+
},
953+
},
954+
"blkid",
955+
[]string{"--match-tag", "UUID", "--output", "value", stagingPath}...,
956+
),
957+
}...)
958+
}
856959
mounter := mountmanager.NewFakeSafeMounterWithCustomExec(&testingexec.FakeExec{CommandScript: actionList})
857960
gceDriver := getTestGCEDriverWithCustomMounter(t, mounter)
858961
ns := gceDriver.ns
962+
ns.SysfsPath = tempDir + "/sys"
859963
_, err := ns.NodeStageVolume(context.Background(), tc.req)
860964
if err != nil {
861965
serverError, ok := status.FromError(err)
@@ -882,6 +986,30 @@ func TestNodeStageVolume(t *testing.T) {
882986
if tc.expReadAheadUpdate == false && readAheadUpdateCalled == true {
883987
t.Fatalf("Test updated read ahead, but it was not expected.")
884988
}
989+
if tc.btrfsReclaimData == "" && tc.btrfsReclaimMetadata == "" && blkidCalled {
990+
t.Fatalf("blkid was called, but was not expected.")
991+
}
992+
993+
if tc.btrfsReclaimData != "" {
994+
fname := btrfsPrefix + "/data/bg_reclaim_threshold"
995+
got, err := os.ReadFile(fname)
996+
if err != nil {
997+
t.Fatalf("read %q: %v", fname, err)
998+
}
999+
if s := strings.TrimSpace(string(got)); s != tc.btrfsReclaimData {
1000+
t.Fatalf("%q: expected %q, got %q", fname, tc.btrfsReclaimData, s)
1001+
}
1002+
}
1003+
if tc.btrfsReclaimMetadata != "" {
1004+
fname := btrfsPrefix + "/metadata/bg_reclaim_threshold"
1005+
got, err := os.ReadFile(fname)
1006+
if err != nil {
1007+
t.Fatalf("read %q: %v", fname, err)
1008+
}
1009+
if s := strings.TrimSpace(string(got)); s != tc.btrfsReclaimMetadata {
1010+
t.Fatalf("%q: expected %q, got %q", fname, tc.btrfsReclaimMetadata, s)
1011+
}
1012+
}
8851013
}
8861014
}
8871015

pkg/gce-pd-csi-driver/utils.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,14 @@ func collectMountOptions(fsType string, mntFlags []string) []string {
311311
// passed directly as an option to the mount command.
312312
continue
313313
}
314+
315+
if btrfsReclaimDataRegex.FindString(opt) != "" {
316+
continue
317+
}
318+
if btrfsReclaimMetadataRegex.FindString(opt) != "" {
319+
continue
320+
}
321+
314322
options = append(options, opt)
315323
}
316324

0 commit comments

Comments
 (0)