
Commit 0286d08

Implement watcher & reboot stability for the data cache on the master branch.
1 parent 39a5910 commit 0286d08

35 files changed: +4665 −26 lines

cmd/gce-pd-csi-driver/main.go

+40-3
@@ -105,8 +105,12 @@ const (
 	driverName          = "pd.csi.storage.gke.io"
 	dataCacheLabel      = "datacache-storage-gke-io"
 	dataCacheLabelValue = "enabled"
+	raidedLocalSsdName  = "csi-driver-data-cache"
+	raidedLssdPrefix    = "/dev/md"
 )

+var raidedLocalSsdPath = raidedLssdPrefix + raidedLocalSsdName
+
 func init() {
 	// klog verbosity guide for this package
 	// Use V(2) for one time config information
@@ -266,9 +270,10 @@ func handle() {
 		if nodeName == nil || *nodeName == "" {
 			klog.Errorf("Data cache enabled, but --node-name not passed")
 		}
-		if err := setupDataCache(ctx, *nodeName); err != nil {
+		if err := setupDataCache(ctx, *nodeName, nodeServer.MetadataService.GetName()); err != nil {
 			klog.Errorf("DataCache setup failed: %v", err)
 		}
+		go driver.StartWatcher(*nodeName)
 	}

 	err = gceDriver.SetupGCEDriver(driverName, version, extraVolumeLabels, extraTags, identityServer, controllerServer, nodeServer)
@@ -350,7 +355,7 @@ func urlFlag(target **url.URL, name string, usage string) {
 	})
 }

-func setupDataCache(ctx context.Context, nodeName string) error {
+func setupDataCache(ctx context.Context, nodeName string, nodeId string) error {
 	klog.V(2).Infof("Setting up data cache for node %s", nodeName)
 	if nodeName != common.TestNode {
 		cfg, err := rest.InClusterConfig()
@@ -373,7 +378,39 @@ func setupDataCache(ctx context.Context, nodeName string) error {
 	}
 	klog.V(2).Info("Raiding local ssds to setup data cache")
 	if err := driver.RaidLocalSsds(); err != nil {
-		return fmt.Errorf("Failed to Raid local SSDs, unable to setup data caching, got error %v", err)
+		return fmt.Errorf("failed to Raid local SSDs, unable to setup data caching, got error %v", err)
+	}
+
+	// vgcreate with the raided local ssds.
+	info, err := common.RunCommand("grep", raidedLocalSsdName, "ls", raidedLssdPrefix)
+	if err != nil {
+		return fmt.Errorf("failed while listing raided devices, err: %v, output: %v", err, info)
+	}
+	infoString := strings.TrimSpace(string(info))
+	klog.V(2).Infof("Got Raided LSSD name %v", infoString)
+	raidedLocalSsdPath = raidedLssdPrefix + infoString
+	volumeGroupName := driver.GetVolumeGroupName(nodeId)
+
+	klog.V(2).Infof("vgscan before vgcreate")
+	vgExists := driver.CheckVgExists(volumeGroupName)
+	klog.V(2).Infof("vgscan info contains volumeGroupName or not %v", vgExists)
+	// Check if the required volume group already exists
+	if vgExists {
+		klog.V(2).Infof("VG exists, now check if PD is part of VG")
+
+		// Clean up Volume Group before adding the PD
+		driver.ReduceVolumeGroup(volumeGroupName, true)
+
+		// validate that raidedLSSD is part of VG
+		err = driver.ValidateRaidedLSSDinVG(volumeGroupName)
+		if err != nil {
+			return fmt.Errorf("validateRaidedLSSDinVG error: %v", err)
+		}
+	} else {
+		err := driver.CreateVg(volumeGroupName, raidedLocalSsdPath)
+		if err != nil {
+			return err
+		}
 	}

 	klog.V(2).Infof("Datacache enabled for node %s", nodeName)

go.mod

+1-1
@@ -58,7 +58,7 @@ require (
 	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/emicklei/go-restful v2.9.5+incompatible // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
-	github.com/fsnotify/fsnotify v1.5.4 // indirect
+	github.com/fsnotify/fsnotify v1.8.0 // indirect
 	github.com/go-logr/logr v1.4.2 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/go-openapi/jsonpointer v0.20.0 // indirect

go.sum

+2
@@ -1032,6 +1032,8 @@ github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4
 github.com/fsnotify/fsnotify v1.5.1/go.mod h1:T3375wBYaZdLLcVNkcVbzGHY7f1l/uK5T5Ai1i3InKU=
 github.com/fsnotify/fsnotify v1.5.4 h1:jRbGcIw6P2Meqdwuo0H1p6JVLbL5DHKAKlYndzMwVZI=
 github.com/fsnotify/fsnotify v1.5.4/go.mod h1:OVB6XrOHzAwXMpEM7uPOzcehqUV2UqJxmVXmkdnm1bU=
+github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M=
+github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
 github.com/fsouza/fake-gcs-server v0.0.0-20180612165233-e85be23bdaa8/go.mod h1:1/HufuJ+eaDf4KTnYdS6HJMGvMRU8d4cYTuu/1QaBbI=
 github.com/fsouza/fake-gcs-server v1.19.4/go.mod h1:I0/88nHCASqJJ5M7zVF0zKODkYTcuXFW5J5yajsNJnE=
 github.com/fvbommel/sortorder v1.0.1/go.mod h1:uk88iVf1ovNn1iLfgUVU2F9o5eO30ui720w+kxuqRs0=

pkg/gce-pd-csi-driver/cache.go

+72-20
@@ -7,6 +7,7 @@ import (
 	"strings"

 	csi "github.com/container-storage-interface/spec/lib/go/csi"
+	fsnotify "github.com/fsnotify/fsnotify"

 	"k8s.io/klog/v2"

@@ -25,7 +26,7 @@ var raidedLocalSsdPath = raidedLssdPrefix + raidedLocalSsdName

 func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId string) (string, error) {
 	volumeId := req.GetVolumeId()
-	volumeGroupName := getVolumeGroupName(nodeId)
+	volumeGroupName := GetVolumeGroupName(nodeId)
 	mainDevicePath := "/dev/" + volumeGroupName + "/" + getLvName(mainLvSuffix, volumeId)
 	mainLvName := getLvName(mainLvSuffix, volumeId)
 	klog.V(2).Infof("Volume group available on node %v ", volumeGroupName)
@@ -37,17 +38,6 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 	infoString := strings.TrimSpace(string(info))
 	raidedLocalSsdPath = raidedLssdPrefix + infoString

-	vgExists := checkVgExists(volumeGroupName)
-	if vgExists {
-		// Clean up Volume Group before adding the PD
-		reduceVolumeGroup(volumeGroupName, true)
-	} else {
-		err := createVg(volumeGroupName, devicePath, raidedLocalSsdPath)
-		if err != nil {
-			return mainDevicePath, err
-		}
-	}
-
 	// Check if the Physical Volume(PV) is part of some other volume group
 	args := []string{
 		"--select",
@@ -75,7 +65,7 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 		klog.Errorf("Errored while deactivating VG %v: err: %v: %s", vgNameForPv, err, info)
 	}
 	// CLean up volume group to remove any dangling PV refrences
-	reduceVolumeGroup(vgNameForPv, false)
+	ReduceVolumeGroup(vgNameForPv, false)
 	_, isCached := isCachingSetup(mainLvName)
 	// We will continue to uncache even if it errors to check caching as it is not a terminal issue.
 	if isCached {
@@ -91,7 +81,7 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 			return "", fmt.Errorf("errored while uncaching main LV. %v: %s", err, info)
 		}
 		// CLean up volume group to remove any dangling PV refrences
-		reduceVolumeGroup(vgNameForPv, false)
+		ReduceVolumeGroup(vgNameForPv, false)
 	}
 	info, err = common.RunCommand("" /* pipedCmd */, "" /* pipedCmdArg */, "vgmerge", []string{volumeGroupName, vgNameForPv}...)
 	if err != nil {
@@ -189,7 +179,7 @@ func setupCaching(devicePath string, req *csi.NodeStageVolumeRequest, nodeId str
 	return mainDevicePath, nil
 }

-func checkVgExists(volumeGroupName string) bool {
+func CheckVgExists(volumeGroupName string) bool {
 	args := []string{}
 	info, err := common.RunCommand("" /* pipedCmd */, "" /* pipedCmdArg */, "vgscan", args...)
 	if err != nil {
@@ -202,8 +192,8 @@ func checkVgExists(volumeGroupName string) bool {

 func cleanupCache(volumeId string, nodeId string) error {

-	volumeGroupName := getVolumeGroupName(nodeId)
-	if !checkVgExists(volumeGroupName) {
+	volumeGroupName := GetVolumeGroupName(nodeId)
+	if !CheckVgExists(volumeGroupName) {
 		// If volume group doesn't exist then there's nothing to uncache
 		return nil
 	}
@@ -228,7 +218,7 @@ func cleanupCache(volumeId string, nodeId string) error {
 	return nil
 }

-func getVolumeGroupName(nodePath string) string {
+func GetVolumeGroupName(nodePath string) string {
 	nodeSlice := strings.Split(nodePath, "/")
 	nodeId := nodeSlice[len(nodeSlice)-1]
 	nodeHash := common.ShortString(nodeId)
@@ -241,7 +231,7 @@ func getLvName(suffix string, volumeId string) string {
 	return fmt.Sprintf("%s-%s", suffix, pvcName)
 }

-func createVg(volumeGroupName string, devicePath string, raidedLocalSsds string) error {
+func CreateVg(volumeGroupName string, raidedLocalSsds string) error {
 	args := []string{
 		"--zero",
 		"y",
@@ -263,7 +253,7 @@ func createVg(volumeGroupName string, devicePath string, raidedLocalSsds string)
 	return nil
 }

-func reduceVolumeGroup(volumeGroupName string, force bool) {
+func ReduceVolumeGroup(volumeGroupName string, force bool) {
 	args := []string{
 		"--removemissing",
 		volumeGroupName,
@@ -366,3 +356,65 @@ func isCachingSetup(mainLvName string) (error, bool) {
 	}
 	return nil, false
 }
+
+func StartWatcher(nodeName string) {
+	dirToWatch := "/dev/"
+	watcher, err := fsnotify.NewWatcher()
+	if err != nil {
+		klog.V(2).ErrorS(err, "errored while creating watcher")
+	}
+	defer watcher.Close()
+
+	// out of the box fsnotify can watch a single file, or a single directory
+	if err := watcher.Add(dirToWatch); err != nil {
+		klog.V(2).ErrorS(err, "errored while adding watcher directory")
+	}
+	errorCh := make(chan error, 1)
+	// Handle the error received from the watcher goroutine
+	go watchDiskDetaches(watcher, nodeName, errorCh)
+
+	select {
+	case err := <-errorCh:
+		klog.Errorf("watcher encountered an error: %v", err)
+	}
+}
+
+func watchDiskDetaches(watcher *fsnotify.Watcher, nodeName string, errorCh chan error) error {
+	for {
+		select {
+		// watch for errors
+		case err := <-watcher.Errors:
+			errorCh <- fmt.Errorf("disk update event errored: %v", err)
+		// watch for events
+		case event := <-watcher.Events:
+			// In case of an event i.e. creation or deletion of any new PV, we update the VG metadata.
+			// This might include some non-LVM changes, no harm in updating metadata multiple times.
+			ReduceVolumeGroup(GetVolumeGroupName(nodeName), true)
+			klog.V(2).Infof("disk attach/detach event %#v\n", event)
+		}
+	}
+}
+
+func ValidateRaidedLSSDinVG(vgName string) error {
+	args := []string{
+		"--noheadings",
+		"-o",
+		"pv_name",
+		"--select",
+		"vg_name=" + vgName,
+	}
+	info, err := common.RunCommand("" /* pipedCmd */, "" /* pipedCmdArg */, "pvs", args...)
+	if err != nil {
+		return fmt.Errorf("errored while checking physical volume details %v: %s", err, info)
+		// On error info contains the error message which we cannot use for further steps
+	}
+
+	klog.V(2).Infof("Got PVs %v in VG %v", strings.TrimSpace(string(info)), vgName)
+	if !strings.Contains(string(info), "/dev/md127") {
+		info, err := common.RunCommand("" /* pipedCmd */, "" /* pipedCmdArg */, "vgextend", []string{vgName, "/dev/md127"}...)
+		if err != nil {
+			klog.Errorf("errored while extending VGs %v: %s", err, info)
+		}
+	}
+	return nil
+}
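
The new watcher loop above is the core of the detach-stability fix: any create/remove event under /dev/ triggers ReduceVolumeGroup so the VG metadata never keeps references to PVs that have gone away. Below is a minimal, self-contained fsnotify sketch of the same pattern, written against the upstream github.com/fsnotify/fsnotify import path; refreshVGMetadata is a hypothetical stand-in for the ReduceVolumeGroup(GetVolumeGroupName(nodeName), true) call in watchDiskDetaches.

// Sketch only (not driver code): watch /dev/ and run a refresh hook on device events.
package main

import (
	"log"

	"github.com/fsnotify/fsnotify"
)

func refreshVGMetadata(event fsnotify.Event) {
	// In the driver this is where dangling PV references are pruned from the VG.
	log.Printf("device event %s on %s, refreshing VG metadata", event.Op, event.Name)
}

func main() {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		log.Fatalf("creating watcher: %v", err)
	}
	defer watcher.Close()

	// fsnotify watches a single directory (non-recursively); /dev/ is enough for
	// local SSD / PD attach and detach to surface as device node changes.
	if err := watcher.Add("/dev/"); err != nil {
		log.Fatalf("watching /dev/: %v", err)
	}

	for {
		select {
		case err, ok := <-watcher.Errors:
			if !ok {
				return
			}
			log.Printf("watcher error: %v", err)
		case event, ok := <-watcher.Events:
			if !ok {
				return
			}
			refreshVGMetadata(event)
		}
	}
}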

vendor/github.com/fsnotify/fsnotify/.cirrus.yml

+14
Some generated files are not rendered by default.

vendor/github.com/fsnotify/fsnotify/.gitignore

+10
Some generated files are not rendered by default.

vendor/github.com/fsnotify/fsnotify/.mailmap

+2
Some generated files are not rendered by default.
