Commit edc5a0a
btrfs reclaim on kernel v5.19+: use bg_reclaim_threshold
Add a sysfs knob `btrfs-allocation-{,meta}data-bg_reclaim_threshold`, which does the equivalent of:

```
echo VALUE > /sys/fs/btrfs/FS-UUID/allocation/data/bg_reclaim_threshold
```

or, for metadata:

```
echo VALUE > /sys/fs/btrfs/FS-UUID/allocation/metadata/bg_reclaim_threshold
```

where `VALUE` is a number between `0` and `99`, inclusive. The knob is exposed as a "special" mount option, similar to `read_ahead_kb`, since that is convenient to configure.

Some resources on `bg_reclaim_threshold` and, more broadly, on balancing a btrfs filesystem:

- https://btrfs.readthedocs.io/en/latest/Administration.html#uuid-allocations-data-metadata-system
- https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=18bb8bbf13c1839b43c9e09e76d397b753989af2
- https://lwn.net/Articles/978826/

On Linux v6.11+, the work described in the last link may make `bg_reclaim_threshold` obsolete.

Author's interpretation
-----------------------

The higher the reclaim threshold, the more accurately btrfs reports unused space (the `Device unallocated` row of `btrfs filesystem usage`), at the expense of sometimes needlessly moving data around. The lower the threshold, the less rebalancing happens, and the less accurate the remaining-space figures become. The author of this commit prefers extra IO in exchange for a more accurate `Device unallocated` figure, and therefore sets `btrfs-allocation-data-bg_reclaim_threshold=90`.

Fixes #2088
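As an illustration only (not part of the commit), here is a minimal, standalone Go sketch of what the driver effectively does for the data threshold: take the value from the special mount option, check that it is in the 0-99 range, and write it to the corresponding sysfs file. The UUID below is a made-up placeholder; the driver resolves the real one with `blkid` after staging the volume.

```go
package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

func main() {
	// Hypothetical inputs: the special mount option as a user would set it on a
	// btrfs volume, and a placeholder filesystem UUID (the driver obtains the
	// real UUID via blkid once the volume is staged).
	flag := "btrfs-allocation-data-bg_reclaim_threshold=90"
	uuid := "0a1b2c3d-0000-0000-0000-000000000001"

	value := strings.TrimPrefix(flag, "btrfs-allocation-data-bg_reclaim_threshold=")
	if n, err := strconv.Atoi(value); err != nil || n < 0 || n > 99 {
		fmt.Fprintln(os.Stderr, "value must be between 0 and 99 inclusive")
		os.Exit(1)
	}

	// Equivalent of: echo 90 > /sys/fs/btrfs/<UUID>/allocation/data/bg_reclaim_threshold
	path := fmt.Sprintf("/sys/fs/btrfs/%s/allocation/data/bg_reclaim_threshold", uuid)
	if err := os.WriteFile(path, []byte(value), 0o644); err != nil {
		fmt.Fprintln(os.Stderr, "write failed:", err)
	}
}
```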
1 parent 68021f0 commit edc5a0a

6 files changed: +236 −12 lines changed

README.md

Lines changed: 7 additions & 1 deletion

@@ -96,7 +96,13 @@ Controller-level and node-level deployments will both have priorityClassName set

 As noted in [GCP PD documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/gce-pd-csi-driver), `ext4` and `xfs` are officially supported. `btrfs` support is experimental:
 - As of writing, Ubuntu VM images support btrfs, but [COS does not](https://cloud.google.com/container-optimized-os/docs/concepts/supported-filesystems).
-- Early testers have observed CSI driver OOMs when mounting larger (1TiB+) btrfs volumes under default memory constraints. The default constraint, as of writing, is 50MiB.
+
+The `btrfs` filesystem accepts two "special" mount options:
+
+- `btrfs-allocation-data-bg_reclaim_threshold`
+- `btrfs-allocation-metadata-bg_reclaim_threshold`
+
+These write to `/sys/fs/btrfs/FS-UUID/allocation/{,meta}data/bg_reclaim_threshold`, as documented [in the btrfs docs](https://btrfs.readthedocs.io/en/latest/ch-sysfs.html#uuid-allocations-data-metadata-system).

 ## Further Documentation
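To make the README's pointer concrete, here is a small, hedged Go sketch (not part of this commit) that reads both thresholds back on a node. The UUID is a placeholder; on a real node it would come from `blkid --match-tag UUID --output value` against the staged volume, or from `btrfs filesystem show`.

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	// Placeholder UUID; substitute the real filesystem UUID on the node.
	uuid := "0a1b2c3d-0000-0000-0000-000000000001"
	for _, kind := range []string{"data", "metadata"} {
		path := fmt.Sprintf("/sys/fs/btrfs/%s/allocation/%s/bg_reclaim_threshold", uuid, kind)
		b, err := os.ReadFile(path)
		if err != nil {
			fmt.Fprintf(os.Stderr, "read %s: %v\n", path, err)
			continue
		}
		fmt.Printf("%s bg_reclaim_threshold = %s\n", kind, strings.TrimSpace(string(b)))
	}
}
```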

cmd/gce-pd-csi-driver/main.go

Lines changed: 1 addition & 0 deletions

@@ -265,6 +265,7 @@ func handle() {
 		DeviceInUseTimeout:       *deviceInUseTimeout,
 		EnableDataCache:          *enableDataCacheFlag,
 		DataCacheEnabledNodePool: isDataCacheEnabledNodePool,
+		SysfsPath:                "/sys",
 	}
 	nodeServer = driver.NewNodeServer(gceDriver, mounter, deviceUtils, meta, statter, nsArgs)

pkg/gce-pd-csi-driver/gce-pd-driver.go

Lines changed: 1 addition & 0 deletions

@@ -157,6 +157,7 @@ func NewNodeServer(gceDriver *GCEDriver, mounter *mount.SafeFormatAndMount, devi
 		deviceInUseErrors:        newDeviceErrMap(args.DeviceInUseTimeout),
 		EnableDataCache:          args.EnableDataCache,
 		DataCacheEnabledNodePool: args.DataCacheEnabledNodePool,
+		SysfsPath:                args.SysfsPath,
 	}
 }

pkg/gce-pd-csi-driver/node.go

Lines changed: 80 additions & 0 deletions

@@ -15,6 +15,7 @@ limitations under the License.
 package gceGCEDriver

 import (
+	"bytes"
 	"context"
 	"errors"
 	"fmt"

@@ -51,6 +52,7 @@ type GCENodeServer struct {
 	MetadataService          metadataservice.MetadataService
 	EnableDataCache          bool
 	DataCacheEnabledNodePool bool
+	SysfsPath                string

 	// A map storing all volumes with ongoing operations so that additional operations
 	// for that same volume (as defined by VolumeID) return an Aborted error

@@ -87,6 +89,9 @@ type NodeServerArgs struct {
 	EnableDataCache bool

 	DataCacheEnabledNodePool bool
+
+	// SysfsPath defaults to "/sys", except if it's a unit test.
+	SysfsPath string
 }

 var _ csi.NodeServer = &GCENodeServer{}

@@ -114,10 +119,14 @@ const (
 	fsTypeExt3 = "ext3"

 	readAheadKBMountFlagRegexPattern = "^read_ahead_kb=(.+)$"
+	btrfsReclaimDataRegexPattern     = "^btrfs-allocation-data-bg_reclaim_threshold=(\\d{1,2})$"     // 0-99 are valid, incl. 00
+	btrfsReclaimMetadataRegexPattern = "^btrfs-allocation-metadata-bg_reclaim_threshold=(\\d{1,2})$" // ditto ^
 )

 var (
 	readAheadKBMountFlagRegex = regexp.MustCompile(readAheadKBMountFlagRegexPattern)
+	btrfsReclaimDataRegex     = regexp.MustCompile(btrfsReclaimDataRegexPattern)
+	btrfsReclaimMetadataRegex = regexp.MustCompile(btrfsReclaimMetadataRegexPattern)
 )

 func getDefaultFsType() string {

@@ -390,6 +399,7 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage
 	// Part 3: Mount device to stagingTargetPath
 	fstype := getDefaultFsType()

+	var btrfsReclaimData, btrfsReclaimMetadata string
 	shouldUpdateReadAhead := false
 	var readAheadKB int64
 	options := []string{}

@@ -403,6 +413,10 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage
 		if err != nil {
 			return nil, status.Errorf(codes.InvalidArgument, "failure parsing mount flags: %v", err.Error())
 		}
+
+		if mnt.FsType == "btrfs" {
+			btrfsReclaimData, btrfsReclaimMetadata = extractBtrfsReclaimFlags(mnt.MountFlags)
+		}
 	} else if blk := volumeCapability.GetBlock(); blk != nil {
 		// Noop for Block NodeStageVolume
 		klog.V(4).Infof("NodeStageVolume succeeded on %v to %s, capability is block so this is a no-op", volumeID, stagingTargetPath)

@@ -454,10 +468,64 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage
 		}
 	}

+	// Part 6: if configured, write sysfs values
+	if !readonly {
+		sysfs := map[string]string{}
+		if btrfsReclaimData != "" {
+			sysfs["allocation/data/bg_reclaim_threshold"] = btrfsReclaimData
+		}
+		if btrfsReclaimMetadata != "" {
+			sysfs["allocation/metadata/bg_reclaim_threshold"] = btrfsReclaimMetadata
+		}
+
+		if len(sysfs) > 0 {
+			args := []string{"--match-tag", "UUID", "--output", "value", stagingTargetPath}
+			cmd := ns.Mounter.Exec.Command("blkid", args...)
+			var stderr bytes.Buffer
+			cmd.SetStderr(&stderr)
+			klog.V(4).Infof(
+				"running %q for volume %s",
+				strings.Join(append([]string{"blkid"}, args...), " "),
+				volumeID,
+			)
+			uuid, err := cmd.Output()
+			if err != nil {
+				klog.Errorf("blkid failed for %s. stderr:\n%s", volumeID, stderr.String())
+				return nil, status.Errorf(codes.Internal, "blkid failed: %v", err)
+			}
+			uuid = bytes.TrimRight(uuid, "\n")
+
+			for key, value := range sysfs {
+				path := fmt.Sprintf("%s/fs/btrfs/%s/%s", ns.SysfsPath, uuid, key)
+				if err := writeSysfs(path, value); err != nil {
+					return nil, status.Error(codes.Internal, err.Error())
+				}
+				klog.V(4).Infof("NodeStageVolume set %s %s=%s", volumeID, key, value)
+			}
+		}
+	}
+
 	klog.V(4).Infof("NodeStageVolume succeeded on %v to %s", volumeID, stagingTargetPath)
 	return &csi.NodeStageVolumeResponse{}, nil
 }

+func writeSysfs(path, value string) (_err error) {
+	f, err := os.OpenFile(path, os.O_WRONLY|os.O_TRUNC, 0o644)
+	if err != nil {
+		return err
+	}
+
+	defer func() {
+		_err = errors.Join(_err, f.Close())
+	}()
+
+	if _, err := f.Write([]byte(value)); err != nil {
+		return err
+	}
+
+	return nil
+}
+
 func (ns *GCENodeServer) updateReadAhead(devicePath string, readAheadKB int64) error {
 	isBlock, err := ns.VolumeStatter.IsBlockDevice(devicePath)
 	if err != nil {

@@ -474,6 +542,18 @@ func (ns *GCENodeServer) updateReadAhead(devicePath string, readAheadKB int64) e
 	return nil
 }

+func extractBtrfsReclaimFlags(mountFlags []string) (string, string) {
+	var reclaimData, reclaimMetadata string
+	for _, mountFlag := range mountFlags {
+		if got := btrfsReclaimDataRegex.FindStringSubmatch(mountFlag); len(got) == 2 {
+			reclaimData = got[1]
+		} else if got := btrfsReclaimMetadataRegex.FindStringSubmatch(mountFlag); len(got) == 2 {
+			reclaimMetadata = got[1]
+		}
+	}
+	return reclaimData, reclaimMetadata
+}
+
 func extractReadAheadKBMountFlag(mountFlags []string) (int64, bool, error) {
 	for _, mountFlag := range mountFlags {
 		if readAheadKB := readAheadKBMountFlagRegex.FindStringSubmatch(mountFlag); len(readAheadKB) == 2 {
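As an aside (not part of the commit), the two patterns only accept one or two digits, so 0-99, including "00", are recognized while anything else is left untouched. The standalone sketch below runs a few hypothetical mount flags through the same expressions to make that concrete.

```go
package main

import (
	"fmt"
	"regexp"
)

var (
	// Same patterns as node.go: one or two digits, i.e. 0-99 (including "00").
	btrfsReclaimDataRegex     = regexp.MustCompile(`^btrfs-allocation-data-bg_reclaim_threshold=(\d{1,2})$`)
	btrfsReclaimMetadataRegex = regexp.MustCompile(`^btrfs-allocation-metadata-bg_reclaim_threshold=(\d{1,2})$`)
)

func main() {
	// Hypothetical mount flags as they might arrive in a NodeStageVolume request.
	flags := []string{
		"discard",
		"btrfs-allocation-data-bg_reclaim_threshold=90",
		"btrfs-allocation-metadata-bg_reclaim_threshold=05",
		"btrfs-allocation-data-bg_reclaim_threshold=100", // out of range: does not match
	}
	for _, f := range flags {
		if m := btrfsReclaimDataRegex.FindStringSubmatch(f); len(m) == 2 {
			fmt.Printf("data threshold %s from %q\n", m[1], f)
		} else if m := btrfsReclaimMetadataRegex.FindStringSubmatch(f); len(m) == 2 {
			fmt.Printf("metadata threshold %s from %q\n", m[1], f)
		} else {
			fmt.Printf("ordinary mount flag %q\n", f)
		}
	}
}
```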

pkg/gce-pd-csi-driver/node_test.go

Lines changed: 139 additions & 11 deletions

@@ -537,18 +537,34 @@ func TestNodeStageVolume(t *testing.T) {
 	defer os.RemoveAll(tempDir)
 	stagingPath := filepath.Join(tempDir, defaultStagingPath)

+	btrfsUUID := "00000000-0000-0000-0000-000000000001"
+	btrfsPrefix := fmt.Sprintf("%s/sys/fs/btrfs/%s/allocation", tempDir, btrfsUUID)
+
+	for _, suffix := range []string{"data", "metadata"} {
+		dir := btrfsPrefix + "/" + suffix
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			t.Fatalf("Failed to set up fake sysfs dir %q: %v", dir, err)
+		}
+		fname := dir + "/bg_reclaim_threshold"
+		if err := os.WriteFile(fname, []byte("0\n"), 0644); err != nil {
+			t.Fatalf("write %q: %v", fname, err)
+		}
+	}
+
 	testCases := []struct {
-		name               string
-		req                *csi.NodeStageVolumeRequest
-		deviceSize         int
-		blockExtSize       int
-		readonlyBit        string
-		expResize          bool
-		expReadAheadUpdate bool
-		expReadAheadKB     string
-		readAheadSectors   string
-		sectorSizeInBytes  int
-		expErrCode         codes.Code
+		name                 string
+		req                  *csi.NodeStageVolumeRequest
+		deviceSize           int
+		blockExtSize         int
+		readonlyBit          string
+		expResize            bool
+		expReadAheadUpdate   bool
+		expReadAheadKB       string
+		readAheadSectors     string
+		btrfsReclaimData     string
+		btrfsReclaimMetadata string
+		sectorSizeInBytes    int
+		expErrCode           codes.Code
 	}{
 		{
 			name: "Valid request, no resize because block and filesystem sizes match",

@@ -598,6 +614,76 @@ func TestNodeStageVolume(t *testing.T) {
 			readonlyBit: "0",
 			expResize:   false,
 		},
+		{
+			name: "btrfs-allocation-data-bg_reclaim_threshold is ignored on non-btrfs",
+			req: &csi.NodeStageVolumeRequest{
+				VolumeId:          volumeID,
+				StagingTargetPath: stagingPath,
+				VolumeCapability: &csi.VolumeCapability{
+					AccessType: &csi.VolumeCapability_Mount{
+						Mount: &csi.VolumeCapability_MountVolume{
+							FsType:     "ext4",
+							MountFlags: []string{"btrfs-allocation-data-bg_reclaim_threshold=90"},
+						},
+					},
+					AccessMode: &csi.VolumeCapability_AccessMode{
+						Mode: csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
+					},
+				},
+			},
+			deviceSize:       1,
+			blockExtSize:     1,
+			readonlyBit:      "0",
+			btrfsReclaimData: "0",
+		},
+		{
+			name: "Valid request, set btrfs-allocation-data-bg_reclaim_threshold=90",
+			req: &csi.NodeStageVolumeRequest{
+				VolumeId:          volumeID,
+				StagingTargetPath: stagingPath,
+				VolumeCapability: &csi.VolumeCapability{
+					AccessType: &csi.VolumeCapability_Mount{
+						Mount: &csi.VolumeCapability_MountVolume{
+							FsType:     "btrfs",
+							MountFlags: []string{"btrfs-allocation-data-bg_reclaim_threshold=90"},
+						},
+					},
+					AccessMode: &csi.VolumeCapability_AccessMode{
+						Mode: csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
+					},
+				},
+			},
+			deviceSize:       1,
+			blockExtSize:     1,
+			readonlyBit:      "0",
+			btrfsReclaimData: "90",
+		},
+		{
+			name: "Valid request, set btrfs-allocation-{,meta}data-bg_reclaim_threshold",
+			req: &csi.NodeStageVolumeRequest{
+				VolumeId:          volumeID,
+				StagingTargetPath: stagingPath,
+				VolumeCapability: &csi.VolumeCapability{
+					AccessType: &csi.VolumeCapability_Mount{
+						Mount: &csi.VolumeCapability_MountVolume{
+							FsType: "btrfs",
+							MountFlags: []string{
+								"btrfs-allocation-data-bg_reclaim_threshold=90",
+								"btrfs-allocation-metadata-bg_reclaim_threshold=91",
+							},
+						},
+					},
+					AccessMode: &csi.VolumeCapability_AccessMode{
+						Mode: csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
+					},
+				},
+			},
+			deviceSize:           1,
+			blockExtSize:         1,
+			readonlyBit:          "0",
+			btrfsReclaimData:     "90",
+			btrfsReclaimMetadata: "91",
+		},
 		{
 			name: "Valid request, update readahead",
 			req: &csi.NodeStageVolumeRequest{

@@ -730,6 +816,7 @@ func TestNodeStageVolume(t *testing.T) {
 		t.Logf("Test case: %s", tc.name)
 		resizeCalled := false
 		readAheadUpdateCalled := false
+		blkidCalled := false
 		actionList := []testingexec.FakeCommandAction{
 			makeFakeCmd(
 				&testingexec.FakeCmd{

@@ -853,9 +940,26 @@ func TestNodeStageVolume(t *testing.T) {
 				),
 			}...)
 		}
+		if tc.btrfsReclaimData != "" || tc.btrfsReclaimMetadata != "" {
+			actionList = append(actionList, []testingexec.FakeCommandAction{
+				makeFakeCmd(
+					&testingexec.FakeCmd{
+						OutputScript: []testingexec.FakeAction{
+							func() ([]byte, []byte, error) {
+								blkidCalled = true
+								return []byte(btrfsUUID + "\n"), nil, nil
+							},
+						},
+					},
+					"blkid",
+					[]string{"--match-tag", "UUID", "--output", "value", stagingPath}...,
+				),
+			}...)
+		}
 		mounter := mountmanager.NewFakeSafeMounterWithCustomExec(&testingexec.FakeExec{CommandScript: actionList})
 		gceDriver := getTestGCEDriverWithCustomMounter(t, mounter)
 		ns := gceDriver.ns
+		ns.SysfsPath = tempDir + "/sys"
 		_, err := ns.NodeStageVolume(context.Background(), tc.req)
 		if err != nil {
 			serverError, ok := status.FromError(err)

@@ -882,6 +986,30 @@ func TestNodeStageVolume(t *testing.T) {
 		if tc.expReadAheadUpdate == false && readAheadUpdateCalled == true {
 			t.Fatalf("Test updated read ahead, but it was not expected.")
 		}
+		if tc.btrfsReclaimData == "" && tc.btrfsReclaimMetadata == "" && blkidCalled {
+			t.Fatalf("blkid was called, but was not expected.")
+		}
+
+		if tc.btrfsReclaimData != "" {
+			fname := btrfsPrefix + "/data/bg_reclaim_threshold"
+			got, err := os.ReadFile(fname)
+			if err != nil {
+				t.Fatalf("read %q: %v", fname, err)
+			}
+			if s := strings.TrimSpace(string(got)); s != tc.btrfsReclaimData {
+				t.Fatalf("%q: expected %q, got %q", fname, tc.btrfsReclaimData, s)
+			}
+		}
+		if tc.btrfsReclaimMetadata != "" {
+			fname := btrfsPrefix + "/metadata/bg_reclaim_threshold"
+			got, err := os.ReadFile(fname)
+			if err != nil {
+				t.Fatalf("read %q: %v", fname, err)
+			}
+			if s := strings.TrimSpace(string(got)); s != tc.btrfsReclaimMetadata {
+				t.Fatalf("%q: expected %q, got %q", fname, tc.btrfsReclaimMetadata, s)
+			}
+		}
 	}
 }

pkg/gce-pd-csi-driver/utils.go

Lines changed: 8 additions & 0 deletions

@@ -311,6 +311,14 @@ func collectMountOptions(fsType string, mntFlags []string) []string {
 			// passed directly as an option to the mount command.
 			continue
 		}
+
+		if btrfsReclaimDataRegex.FindString(opt) != "" {
+			continue
+		}
+		if btrfsReclaimMetadataRegex.FindString(opt) != "" {
+			continue
+		}
+
 		options = append(options, opt)
 	}
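To illustrate the effect of this skip logic on what ultimately reaches mount(8), here is a hedged, self-contained sketch that re-implements just the filtering step (it is not the driver's collectMountOptions): flags matching either pattern are consumed by the driver and dropped, while ordinary flags pass through.

```go
package main

import (
	"fmt"
	"regexp"
)

var (
	btrfsReclaimDataRegex     = regexp.MustCompile(`^btrfs-allocation-data-bg_reclaim_threshold=(\d{1,2})$`)
	btrfsReclaimMetadataRegex = regexp.MustCompile(`^btrfs-allocation-metadata-bg_reclaim_threshold=(\d{1,2})$`)
)

// filterSpecialFlags mimics the new skip logic above: special flags are
// dropped, everything else is kept for the mount command.
func filterSpecialFlags(mntFlags []string) []string {
	var options []string
	for _, opt := range mntFlags {
		if btrfsReclaimDataRegex.FindString(opt) != "" {
			continue
		}
		if btrfsReclaimMetadataRegex.FindString(opt) != "" {
			continue
		}
		options = append(options, opt)
	}
	return options
}

func main() {
	flags := []string{
		"discard",
		"btrfs-allocation-data-bg_reclaim_threshold=90",
		"btrfs-allocation-metadata-bg_reclaim_threshold=91",
	}
	fmt.Println(filterSpecialFlags(flags)) // [discard]
}
```

Note that a value outside the 0-99 range, such as `btrfs-allocation-data-bg_reclaim_threshold=100`, matches neither pattern, so it would presumably be passed through as an ordinary, unknown mount option rather than being silently applied.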
