Commit 7f44bae (parent 68021f0)

btrfs reclaim on kernel v5.19+: use bg_reclaim_threshold
Add mount options `btrfs-allocation-{,meta}data-bg_reclaim_threshold`, which set the btrfs sysfs knob `bg_reclaim_threshold`, doing the equivalent of:

```
echo VALUE > /sys/fs/btrfs/FS-UUID/allocation/data/bg_reclaim_threshold
```

Or, in the case of metadata:

```
echo VALUE > /sys/fs/btrfs/FS-UUID/allocation/metadata/bg_reclaim_threshold
```

where VALUE is a number between `0` and `99` inclusive. The knob is exposed as a "special" mount option, similarly to `read_ahead_kb`, as that is quite convenient.

Some resources about `bg_reclaim_threshold` and, more broadly, balancing a btrfs filesystem:

- https://btrfs.readthedocs.io/en/latest/Administration.html#uuid-allocations-data-metadata-system
- https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=18bb8bbf13c1839b43c9e09e76d397b753989af2
- https://lwn.net/Articles/978826/

On Linux v6.11+, the dynamic reclaim work described in the LWN article above may make `bg_reclaim_threshold` obsolete.

Author's interpretation
-----------------------

The higher the reclaim threshold, the more accurately btrfs reports unused space (the `Device unallocated` row of `btrfs filesystem usage`), at the expense of sometimes needlessly moving data around. The lower the threshold, the less rebalancing, but the less accurate the remaining-space figures. The author of this commit prefers more IO in exchange for a more accurate `Device unallocated` figure, and therefore sets `btrfs-allocation-data-bg_reclaim_threshold=90`.
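For illustration, here is a minimal standalone sketch of the sysfs write such a mount option triggers, mirroring the `echo` commands above. The UUID and value are placeholders; the driver's actual implementation is `writeSysfs` in `pkg/gce-pd-csi-driver/node.go` below.

```go
package main

import (
    "fmt"
    "os"
)

func main() {
    // Placeholder filesystem UUID and threshold value; valid values are 0-99 inclusive.
    uuid := "00000000-0000-0000-0000-000000000001"
    value := "90"

    // Equivalent of: echo 90 > /sys/fs/btrfs/<uuid>/allocation/data/bg_reclaim_threshold
    path := fmt.Sprintf("/sys/fs/btrfs/%s/allocation/data/bg_reclaim_threshold", uuid)

    f, err := os.OpenFile(path, os.O_WRONLY|os.O_TRUNC, 0o644)
    if err != nil {
        panic(err)
    }
    defer f.Close()

    if _, err := f.Write([]byte(value)); err != nil {
        panic(err)
    }
}
```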

6 files changed (+237, -12 lines)

README.md

Lines changed: 7 additions & 1 deletion
@@ -96,7 +96,13 @@ Controller-level and node-level deployments will both have priorityClassName set
 
 As noted in [GCP PD documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/gce-pd-csi-driver), `ext4` and `xfs` are officially supported. `btrfs` support is experimental:
 - As of writing, Ubuntu VM images support btrfs, but [COS does not](https://cloud.google.com/container-optimized-os/docs/concepts/supported-filesystems).
-- Early testers have observed CSI driver OOMs when mounting larger (1TiB+) btrfs volumes under default memory constraints. The default constraint, as of writing, is 50MiB.
+
+The `btrfs` filesystem accepts two "special" mount options:
+
+- `btrfs-allocation-data-bg_reclaim_threshold`
+- `btrfs-allocation-metadata-bg_reclaim_threshold`
+
+These write to `/sys/fs/btrfs/FS-UUID/allocation/{,meta}data/bg_reclaim_threshold`, as documented [in the btrfs docs](https://btrfs.readthedocs.io/en/latest/ch-sysfs.html#uuid-allocations-data-metadata-system).
 
 ## Further Documentation
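For context, these options reach the driver as CSI mount flags rather than real mount(8) options. A sketch of the request shape the driver handles, mirroring the test cases in `node_test.go` further down (the volume ID and staging path are placeholders):

```go
package main

import (
    "github.com/container-storage-interface/spec/lib/go/csi"
)

func main() {
    // Placeholder IDs and paths; the MountFlags are the part that matters here.
    req := &csi.NodeStageVolumeRequest{
        VolumeId:          "projects/test-project/zones/test-zone/disks/test-disk",
        StagingTargetPath: "/var/lib/kubelet/plugins/staging-path",
        VolumeCapability: &csi.VolumeCapability{
            AccessType: &csi.VolumeCapability_Mount{
                Mount: &csi.VolumeCapability_MountVolume{
                    FsType: "btrfs",
                    MountFlags: []string{
                        "btrfs-allocation-data-bg_reclaim_threshold=90",
                        "btrfs-allocation-metadata-bg_reclaim_threshold=91",
                    },
                },
            },
            AccessMode: &csi.VolumeCapability_AccessMode{
                Mode: csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
            },
        },
    }
    _ = req // passed to NodeStageVolume by the kubelet/CSI machinery
}
```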

cmd/gce-pd-csi-driver/main.go

Lines changed: 1 addition & 0 deletions
@@ -265,6 +265,7 @@ func handle() {
         DeviceInUseTimeout:       *deviceInUseTimeout,
         EnableDataCache:          *enableDataCacheFlag,
         DataCacheEnabledNodePool: isDataCacheEnabledNodePool,
+        SysfsPath:                "/sys",
     }
     nodeServer = driver.NewNodeServer(gceDriver, mounter, deviceUtils, meta, statter, nsArgs)

pkg/gce-pd-csi-driver/gce-pd-driver.go

Lines changed: 1 addition & 0 deletions
@@ -157,6 +157,7 @@ func NewNodeServer(gceDriver *GCEDriver, mounter *mount.SafeFormatAndMount, devi
         deviceInUseErrors:        newDeviceErrMap(args.DeviceInUseTimeout),
         EnableDataCache:          args.EnableDataCache,
         DataCacheEnabledNodePool: args.DataCacheEnabledNodePool,
+        SysfsPath:                args.SysfsPath,
     }
 }

pkg/gce-pd-csi-driver/node.go

Lines changed: 81 additions & 0 deletions
@@ -15,6 +15,7 @@ limitations under the License.
 package gceGCEDriver
 
 import (
+    "bytes"
     "context"
     "errors"
     "fmt"
@@ -51,6 +52,7 @@ type GCENodeServer struct {
     MetadataService          metadataservice.MetadataService
     EnableDataCache          bool
     DataCacheEnabledNodePool bool
+    SysfsPath                string
 
     // A map storing all volumes with ongoing operations so that additional operations
     // for that same volume (as defined by VolumeID) return an Aborted error
@@ -87,6 +89,9 @@ type NodeServerArgs struct {
     EnableDataCache bool
 
     DataCacheEnabledNodePool bool
+
+    // SysfsPath defaults to "/sys", except if it's a unit test.
+    SysfsPath string
 }
 
 var _ csi.NodeServer = &GCENodeServer{}
@@ -112,12 +117,17 @@ const (
     defaultLinuxFsType   = "ext4"
     defaultWindowsFsType = "ntfs"
     fsTypeExt3           = "ext3"
+    fsTypeBtrfs          = "btrfs"
 
     readAheadKBMountFlagRegexPattern = "^read_ahead_kb=(.+)$"
+    btrfsReclaimDataRegexPattern     = "^btrfs-allocation-data-bg_reclaim_threshold=(\\d{1,2})$"     // 0-99 are valid, incl. 00
+    btrfsReclaimMetadataRegexPattern = "^btrfs-allocation-metadata-bg_reclaim_threshold=(\\d{1,2})$" // ditto ^
 )
 
 var (
     readAheadKBMountFlagRegex = regexp.MustCompile(readAheadKBMountFlagRegexPattern)
+    btrfsReclaimDataRegex     = regexp.MustCompile(btrfsReclaimDataRegexPattern)
+    btrfsReclaimMetadataRegex = regexp.MustCompile(btrfsReclaimMetadataRegexPattern)
 )
 
 func getDefaultFsType() string {
@@ -390,6 +400,7 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage
     // Part 3: Mount device to stagingTargetPath
     fstype := getDefaultFsType()
 
+    var btrfsReclaimData, btrfsReclaimMetadata string
     shouldUpdateReadAhead := false
     var readAheadKB int64
     options := []string{}
@@ -403,6 +414,10 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage
         if err != nil {
             return nil, status.Errorf(codes.InvalidArgument, "failure parsing mount flags: %v", err.Error())
         }
+
+        if mnt.FsType == fsTypeBtrfs {
+            btrfsReclaimData, btrfsReclaimMetadata = extractBtrfsReclaimFlags(mnt.MountFlags)
+        }
     } else if blk := volumeCapability.GetBlock(); blk != nil {
         // Noop for Block NodeStageVolume
         klog.V(4).Infof("NodeStageVolume succeeded on %v to %s, capability is block so this is a no-op", volumeID, stagingTargetPath)
@@ -454,10 +469,64 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage
         }
     }
 
+    // Part 6: if configured, write sysfs values
+    if !readonly {
+        sysfs := map[string]string{}
+        if btrfsReclaimData != "" {
+            sysfs["allocation/data/bg_reclaim_threshold"] = btrfsReclaimData
+        }
+        if btrfsReclaimMetadata != "" {
+            sysfs["allocation/metadata/bg_reclaim_threshold"] = btrfsReclaimMetadata
+        }
+
+        if len(sysfs) > 0 {
+            args := []string{"--match-tag", "UUID", "--output", "value", stagingTargetPath}
+            cmd := ns.Mounter.Exec.Command("blkid", args...)
+            var stderr bytes.Buffer
+            cmd.SetStderr(&stderr)
+            klog.V(4).Infof(
+                "running %q for volume %s",
+                strings.Join(append([]string{"blkid"}, args...), " "),
+                volumeID,
+            )
+            uuid, err := cmd.Output()
+            if err != nil {
+                klog.Errorf("blkid failed for %s. stderr:\n%s", volumeID, stderr.String())
+                return nil, status.Errorf(codes.Internal, "blkid failed: %v", err)
+            }
+            uuid = bytes.TrimRight(uuid, "\n")
+
+            for key, value := range sysfs {
+                path := fmt.Sprintf("%s/fs/btrfs/%s/%s", ns.SysfsPath, uuid, key)
+                if err := writeSysfs(path, value); err != nil {
+                    return nil, status.Error(codes.Internal, err.Error())
+                }
+                klog.V(4).Infof("NodeStageVolume set %s %s=%s", volumeID, key, value)
+            }
+        }
+    }
+
     klog.V(4).Infof("NodeStageVolume succeeded on %v to %s", volumeID, stagingTargetPath)
     return &csi.NodeStageVolumeResponse{}, nil
 }
 
+func writeSysfs(path, value string) (_err error) {
+    f, err := os.OpenFile(path, os.O_WRONLY|os.O_TRUNC, 0o644)
+    if err != nil {
+        return err
+    }
+
+    defer func() {
+        _err = errors.Join(_err, f.Close())
+    }()
+
+    if _, err := f.Write([]byte(value)); err != nil {
+        return err
+    }
+
+    return nil
+}
+
 func (ns *GCENodeServer) updateReadAhead(devicePath string, readAheadKB int64) error {
     isBlock, err := ns.VolumeStatter.IsBlockDevice(devicePath)
     if err != nil {
@@ -474,6 +543,18 @@ func (ns *GCENodeServer) updateReadAhead(devicePath string, readAheadKB int64) e
     return nil
 }
 
+func extractBtrfsReclaimFlags(mountFlags []string) (string, string) {
+    var reclaimData, reclaimMetadata string
+    for _, mountFlag := range mountFlags {
+        if got := btrfsReclaimDataRegex.FindStringSubmatch(mountFlag); len(got) == 2 {
+            reclaimData = got[1]
+        } else if got := btrfsReclaimMetadataRegex.FindStringSubmatch(mountFlag); len(got) == 2 {
+            reclaimMetadata = got[1]
+        }
+    }
+    return reclaimData, reclaimMetadata
+}
+
 func extractReadAheadKBMountFlag(mountFlags []string) (int64, bool, error) {
     for _, mountFlag := range mountFlags {
         if readAheadKB := readAheadKBMountFlagRegex.FindStringSubmatch(mountFlag); len(readAheadKB) == 2 {

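As a quick illustration of what the new regexes accept: only one- or two-digit values match, so 0-99 (including `00`) are taken, and anything else is silently ignored. Below is a standalone sketch using the same pattern as `btrfsReclaimDataRegexPattern` above, not the driver code itself:

```go
package main

import (
    "fmt"
    "regexp"
)

// Same pattern as btrfsReclaimDataRegexPattern in node.go.
var btrfsReclaimDataRegex = regexp.MustCompile(`^btrfs-allocation-data-bg_reclaim_threshold=(\d{1,2})$`)

func main() {
    for _, flag := range []string{
        "btrfs-allocation-data-bg_reclaim_threshold=90",  // matches, captures "90"
        "btrfs-allocation-data-bg_reclaim_threshold=00",  // matches, captures "00"
        "btrfs-allocation-data-bg_reclaim_threshold=100", // no match: three digits
        "btrfs-allocation-data-bg_reclaim_threshold=-1",  // no match: not plain digits
        "read_ahead_kb=4096",                             // no match: a different special option
    } {
        if got := btrfsReclaimDataRegex.FindStringSubmatch(flag); len(got) == 2 {
            fmt.Printf("%s -> threshold %s\n", flag, got[1])
        } else {
            fmt.Printf("%s -> ignored\n", flag)
        }
    }
}
```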
pkg/gce-pd-csi-driver/node_test.go

Lines changed: 139 additions & 11 deletions
@@ -537,18 +537,34 @@ func TestNodeStageVolume(t *testing.T) {
     defer os.RemoveAll(tempDir)
     stagingPath := filepath.Join(tempDir, defaultStagingPath)
 
+    btrfsUUID := "00000000-0000-0000-0000-000000000001"
+    btrfsPrefix := fmt.Sprintf("%s/sys/fs/btrfs/%s/allocation", tempDir, btrfsUUID)
+
+    for _, suffix := range []string{"data", "metadata"} {
+        dir := btrfsPrefix + "/" + suffix
+        if err := os.MkdirAll(dir, 0755); err != nil {
+            t.Fatalf("Failed to set up fake sysfs dir %q: %v", dir, err)
+        }
+        fname := dir + "/bg_reclaim_threshold"
+        if err := os.WriteFile(fname, []byte("0\n"), 0644); err != nil {
+            t.Fatalf("write %q: %v", fname, err)
+        }
+    }
+
     testCases := []struct {
-        name               string
-        req                *csi.NodeStageVolumeRequest
-        deviceSize         int
-        blockExtSize       int
-        readonlyBit        string
-        expResize          bool
-        expReadAheadUpdate bool
-        expReadAheadKB     string
-        readAheadSectors   string
-        sectorSizeInBytes  int
-        expErrCode         codes.Code
+        name                 string
+        req                  *csi.NodeStageVolumeRequest
+        deviceSize           int
+        blockExtSize         int
+        readonlyBit          string
+        expResize            bool
+        expReadAheadUpdate   bool
+        expReadAheadKB       string
+        readAheadSectors     string
+        btrfsReclaimData     string
+        btrfsReclaimMetadata string
+        sectorSizeInBytes    int
+        expErrCode           codes.Code
     }{
         {
             name: "Valid request, no resize because block and filesystem sizes match",
@@ -598,6 +614,76 @@ func TestNodeStageVolume(t *testing.T) {
             readonlyBit: "0",
             expResize:   false,
         },
+        {
+            name: "btrfs-allocation-data-bg_reclaim_threshold is ignored on non-btrfs",
+            req: &csi.NodeStageVolumeRequest{
+                VolumeId:          volumeID,
+                StagingTargetPath: stagingPath,
+                VolumeCapability: &csi.VolumeCapability{
+                    AccessType: &csi.VolumeCapability_Mount{
+                        Mount: &csi.VolumeCapability_MountVolume{
+                            FsType:     "ext4",
+                            MountFlags: []string{"btrfs-allocation-data-bg_reclaim_threshold=90"},
+                        },
+                    },
+                    AccessMode: &csi.VolumeCapability_AccessMode{
+                        Mode: csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
+                    },
+                },
+            },
+            deviceSize:       1,
+            blockExtSize:     1,
+            readonlyBit:      "0",
+            btrfsReclaimData: "0",
+        },
+        {
+            name: "Valid request, set btrfs-allocation-data-bg_reclaim_threshold=90",
+            req: &csi.NodeStageVolumeRequest{
+                VolumeId:          volumeID,
+                StagingTargetPath: stagingPath,
+                VolumeCapability: &csi.VolumeCapability{
+                    AccessType: &csi.VolumeCapability_Mount{
+                        Mount: &csi.VolumeCapability_MountVolume{
+                            FsType:     "btrfs",
+                            MountFlags: []string{"btrfs-allocation-data-bg_reclaim_threshold=90"},
+                        },
+                    },
+                    AccessMode: &csi.VolumeCapability_AccessMode{
+                        Mode: csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
+                    },
+                },
+            },
+            deviceSize:       1,
+            blockExtSize:     1,
+            readonlyBit:      "0",
+            btrfsReclaimData: "90",
+        },
+        {
+            name: "Valid request, set btrfs-allocation-{,meta}data-bg_reclaim_threshold",
+            req: &csi.NodeStageVolumeRequest{
+                VolumeId:          volumeID,
+                StagingTargetPath: stagingPath,
+                VolumeCapability: &csi.VolumeCapability{
+                    AccessType: &csi.VolumeCapability_Mount{
+                        Mount: &csi.VolumeCapability_MountVolume{
+                            FsType: "btrfs",
+                            MountFlags: []string{
+                                "btrfs-allocation-data-bg_reclaim_threshold=90",
+                                "btrfs-allocation-metadata-bg_reclaim_threshold=91",
+                            },
+                        },
+                    },
+                    AccessMode: &csi.VolumeCapability_AccessMode{
+                        Mode: csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
+                    },
+                },
+            },
+            deviceSize:           1,
+            blockExtSize:         1,
+            readonlyBit:          "0",
+            btrfsReclaimData:     "90",
+            btrfsReclaimMetadata: "91",
+        },
         {
             name: "Valid request, update readahead",
             req: &csi.NodeStageVolumeRequest{
@@ -730,6 +816,7 @@ func TestNodeStageVolume(t *testing.T) {
         t.Logf("Test case: %s", tc.name)
         resizeCalled := false
         readAheadUpdateCalled := false
+        blkidCalled := false
         actionList := []testingexec.FakeCommandAction{
             makeFakeCmd(
                 &testingexec.FakeCmd{
@@ -853,9 +940,26 @@ func TestNodeStageVolume(t *testing.T) {
                 ),
             }...)
         }
+        if tc.btrfsReclaimData != "" || tc.btrfsReclaimMetadata != "" {
+            actionList = append(actionList, []testingexec.FakeCommandAction{
+                makeFakeCmd(
+                    &testingexec.FakeCmd{
+                        OutputScript: []testingexec.FakeAction{
+                            func() ([]byte, []byte, error) {
+                                blkidCalled = true
+                                return []byte(btrfsUUID + "\n"), nil, nil
+                            },
+                        },
+                    },
+                    "blkid",
+                    []string{"--match-tag", "UUID", "--output", "value", stagingPath}...,
+                ),
+            }...)
+        }
         mounter := mountmanager.NewFakeSafeMounterWithCustomExec(&testingexec.FakeExec{CommandScript: actionList})
         gceDriver := getTestGCEDriverWithCustomMounter(t, mounter)
         ns := gceDriver.ns
+        ns.SysfsPath = tempDir + "/sys"
         _, err := ns.NodeStageVolume(context.Background(), tc.req)
         if err != nil {
             serverError, ok := status.FromError(err)
@@ -882,6 +986,30 @@ func TestNodeStageVolume(t *testing.T) {
         if tc.expReadAheadUpdate == false && readAheadUpdateCalled == true {
             t.Fatalf("Test updated read ahead, but it was not expected.")
         }
+        if tc.btrfsReclaimData == "" && tc.btrfsReclaimMetadata == "" && blkidCalled {
+            t.Fatalf("blkid was called, but was not expected.")
+        }
+
+        if tc.btrfsReclaimData != "" {
+            fname := btrfsPrefix + "/data/bg_reclaim_threshold"
+            got, err := os.ReadFile(fname)
+            if err != nil {
+                t.Fatalf("read %q: %v", fname, err)
+            }
+            if s := strings.TrimSpace(string(got)); s != tc.btrfsReclaimData {
+                t.Fatalf("%q: expected %q, got %q", fname, tc.btrfsReclaimData, s)
+            }
+        }
+        if tc.btrfsReclaimMetadata != "" {
+            fname := btrfsPrefix + "/metadata/bg_reclaim_threshold"
+            got, err := os.ReadFile(fname)
+            if err != nil {
+                t.Fatalf("read %q: %v", fname, err)
+            }
+            if s := strings.TrimSpace(string(got)); s != tc.btrfsReclaimMetadata {
+                t.Fatalf("%q: expected %q, got %q", fname, tc.btrfsReclaimMetadata, s)
+            }
+        }
     }
 }

pkg/gce-pd-csi-driver/utils.go

Lines changed: 8 additions & 0 deletions
@@ -311,6 +311,14 @@ func collectMountOptions(fsType string, mntFlags []string) []string {
             // passed directly as an option to the mount command.
             continue
         }
+
+        if btrfsReclaimDataRegex.FindString(opt) != "" {
+            continue
+        }
+        if btrfsReclaimMetadataRegex.FindString(opt) != "" {
+            continue
+        }
+
         options = append(options, opt)
     }

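The reason for the `collectMountOptions` change above: these pseudo-options are consumed by the driver (via sysfs in `NodeStageVolume`) and should never be passed on to `mount(8)`, which would not recognize them. A standalone sketch of the filtering effect, not the driver's actual function:

```go
package main

import (
    "fmt"
    "regexp"
)

// Same patterns as in node.go, redeclared so this sketch is self-contained.
var (
    btrfsReclaimDataRegex     = regexp.MustCompile(`^btrfs-allocation-data-bg_reclaim_threshold=(\d{1,2})$`)
    btrfsReclaimMetadataRegex = regexp.MustCompile(`^btrfs-allocation-metadata-bg_reclaim_threshold=(\d{1,2})$`)
)

func main() {
    in := []string{"noatime", "btrfs-allocation-data-bg_reclaim_threshold=90", "discard"}

    // Keep everything except the pseudo-options handled through sysfs.
    out := []string{}
    for _, opt := range in {
        if btrfsReclaimDataRegex.MatchString(opt) || btrfsReclaimMetadataRegex.MatchString(opt) {
            continue
        }
        out = append(out, opt)
    }

    fmt.Println(out) // [noatime discard]
}
```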