Skip to content

Commit e3fc317

Browse files
committed
Set expiration for keys in deviceInUse cache. Add unit test.
1 parent 834b272 commit e3fc317

File tree

14 files changed

+409
-966
lines changed

14 files changed

+409
-966
lines changed

pkg/gce-pd-csi-driver/device_error_map.go

+10-9
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,15 @@ limitations under the License.
1515
package gceGCEDriver
1616

1717
import (
18-
"fmt"
1918
"sync"
2019
"time"
2120

22-
lru "github.com/hashicorp/golang-lru/v2"
21+
"github.com/hashicorp/golang-lru/v2/expirable"
2322
"k8s.io/klog/v2"
2423
)
2524

2625
// maxDeviceCacheSize specifies the maximum number if in-use devices to cache.
26+
// 256 was selected since it is twice the number of max PDs per VM (128)
2727
const maxDeviceCacheSize = 256
2828

2929
// currentTime is used to stub time.Now in unit tests
@@ -34,14 +34,11 @@ var currentTime = time.Now
3434
type deviceErrMap struct {
3535
timeout time.Duration
3636
mux sync.Mutex
37-
cache *lru.Cache[string, time.Time]
37+
cache *expirable.LRU[string, time.Time]
3838
}
3939

4040
func newDeviceErrMap(timeout time.Duration) *deviceErrMap {
41-
c, err := lru.New[string, time.Time](maxDeviceCacheSize)
42-
if err != nil {
43-
panic(fmt.Sprintf("Could not initialize deviceInUse LRU cache: %s", err))
44-
}
41+
c := expirable.NewLRU[string, time.Time](maxDeviceCacheSize, nil, timeout*2)
4542

4643
return &deviceErrMap{
4744
cache: c,
@@ -55,8 +52,12 @@ func (devErrMap *deviceErrMap) checkDeviceErrorTimeout(deviceName string) bool {
5552
devErrMap.mux.Lock()
5653
defer devErrMap.mux.Unlock()
5754

58-
lastErrTime, exists := devErrMap.cache.Get(deviceName)
59-
return exists && currentTime().Sub(lastErrTime).Seconds() >= devErrMap.timeout.Seconds()
55+
firstEncounteredErrTime, exists := devErrMap.cache.Get(deviceName)
56+
if !exists {
57+
return false
58+
}
59+
expirationTime := firstEncounteredErrTime.Add(devErrMap.timeout)
60+
return currentTime().After(expirationTime)
6061
}
6162

6263
// markDeviceError updates the internal `cache` map to denote an error was encounted
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
Copyright 2018 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
Unless required by applicable law or agreed to in writing, software
9+
distributed under the License is distributed on an "AS IS" BASIS,
10+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
See the License for the specific language governing permissions and
12+
limitations under the License.
13+
*/
14+
15+
package gceGCEDriver
16+
17+
import (
18+
"testing"
19+
"time"
20+
)
21+
22+
func TestDeviceErrorMap(t *testing.T) {
23+
timeout := time.Second * 10
24+
dName := "fake-device"
25+
eMap := newDeviceErrMap(timeout)
26+
defer func() { currentTime = time.Now }()
27+
28+
// Register an error. Checking the timeout right after should return false
29+
stubCurrentTime(0)
30+
eMap.markDeviceError(dName)
31+
isTimedOut := eMap.checkDeviceErrorTimeout(dName)
32+
if isTimedOut {
33+
t.Errorf("checkDeviceErrorTimeout expected to be false if called immediately after marking an error")
34+
}
35+
36+
// Advance time. Checking the timeout should now return true
37+
stubCurrentTime(int64(timeout.Seconds()) + 1)
38+
isTimedOut = eMap.checkDeviceErrorTimeout(dName)
39+
if !isTimedOut {
40+
t.Errorf("checkDeviceErrorTimeout expected to be true after waiting for timeout")
41+
}
42+
}
43+
44+
func stubCurrentTime(unixTime int64) {
45+
currentTime = func() time.Time {
46+
return time.Unix(unixTime, 0)
47+
}
48+
}

pkg/gce-pd-csi-driver/node.go

+12-14
Original file line numberDiff line numberDiff line change
@@ -471,19 +471,21 @@ func (ns *GCENodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUns
471471
return nil, status.Error(codes.Internal, fmt.Sprintf("NodeUnstageVolume failed: %v\nUnmounting arguments: %s\n", err.Error(), stagingTargetPath))
472472
}
473473

474-
if err := ns.confirmDeviceUnused(volumeID); err != nil {
475-
var ignoreableErr *ignoreableError
476-
if errors.As(err, &ignoreableErr) {
477-
klog.Warningf("Unabled to check if device for %s is unused. Device has been unmounted successfully. Ignoring and continuing with unstaging. (%v)", volumeID, err)
478-
} else if ns.deviceInUseErrors.checkDeviceErrorTimeout(volumeID) {
479-
klog.Warningf("Device %s could not be released after timeout of %f seconds. NodeUnstageVolume will return success.", volumeID, ns.deviceInUseErrors.timeout.Seconds())
480-
} else {
481-
ns.deviceInUseErrors.markDeviceError(volumeID)
482-
return nil, status.Errorf(codes.Internal, "NodeUnstageVolume for volume %s failed: %v", volumeID, err)
474+
if ns.enableDeviceInUseCheck {
475+
if err := ns.confirmDeviceUnused(volumeID); err != nil {
476+
var ignoreableErr *ignoreableError
477+
if errors.As(err, &ignoreableErr) {
478+
klog.Warningf("Unabled to check if device for %s is unused. Device has been unmounted successfully. Ignoring and continuing with unstaging. (%v)", volumeID, err)
479+
} else if ns.deviceInUseErrors.checkDeviceErrorTimeout(volumeID) {
480+
klog.Warningf("Device %s could not be released after timeout of %f seconds. NodeUnstageVolume will return success.", volumeID, ns.deviceInUseErrors.timeout.Seconds())
481+
} else {
482+
ns.deviceInUseErrors.markDeviceError(volumeID)
483+
return nil, status.Errorf(codes.Internal, "NodeUnstageVolume for volume %s failed: %v", volumeID, err)
484+
}
483485
}
486+
ns.deviceInUseErrors.deleteDevice(volumeID)
484487
}
485488

486-
ns.deviceInUseErrors.deleteDevice(volumeID)
487489
klog.V(4).Infof("NodeUnstageVolume succeeded on %v from %s", volumeID, stagingTargetPath)
488490
return &csi.NodeUnstageVolumeResponse{}, nil
489491
}
@@ -492,10 +494,6 @@ func (ns *GCENodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUns
492494
type ignoreableError struct{ error }
493495

494496
func (ns *GCENodeServer) confirmDeviceUnused(volumeID string) error {
495-
if !ns.enableDeviceInUseCheck {
496-
return nil
497-
}
498-
499497
devicePath, err := getDevicePath(ns, volumeID, "" /* partition, which is unused */)
500498
if err != nil {
501499
return &ignoreableError{fmt.Errorf("failed to find device path for volume %s: %v", volumeID, err.Error())}

vendor/github.com/hashicorp/golang-lru/v2/.gitignore

-23
This file was deleted.

vendor/github.com/hashicorp/golang-lru/v2/.golangci.yml

-46
This file was deleted.

0 commit comments

Comments
 (0)