Skip to content

Commit 3e7da49

Browse files
committed
Update VerifyDevicePath logic to run udevadm --trigger to fix scenarios where disk is not found or is wrong device
1 parent dd14907 commit 3e7da49

File tree

8 files changed

+301
-55
lines changed

8 files changed

+301
-55
lines changed

Dockerfile

+5-1
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,15 @@ ADD . .
2020
ARG TAG
2121
RUN CGO_ENABLED=0 GOOS=linux go build -a -ldflags '-X main.vendorVersion='"${TAG:-latest}"' -extldflags "-static"' -o bin/gce-pd-csi-driver ./cmd/
2222

23+
# MAD HACKS: Build a version first so we can take the scsi_id bin and put it somewhere else in our real build
24+
FROM gcr.io/google-containers/debian-base-amd64:v2.0.0 as base
25+
RUN clean-install udev
26+
2327
# Start from Google Debian base
2428
FROM gcr.io/google-containers/debian-base-amd64:v2.0.0
2529
COPY --from=builder /go/src/sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/bin/gce-pd-csi-driver /gce-pd-csi-driver
26-
2730
# Install necessary dependencies
2831
RUN clean-install util-linux e2fsprogs mount ca-certificates udev xfsprogs
32+
COPY --from=base /lib/udev/scsi_id /lib/udev_containerized/scsi_id
2933

3034
ENTRYPOINT ["/gce-pd-csi-driver"]

cmd/main.go

+1-4
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,6 @@ import (
2828
mountmanager "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/mount-manager"
2929
)
3030

31-
func init() {
32-
flag.Set("logtostderr", "true")
33-
}
34-
3531
var (
3632
endpoint = flag.String("endpoint", "unix:/tmp/csi.sock", "CSI endpoint")
3733
gceConfigFilePath = flag.String("cloud-config", "", "Path to GCE cloud provider config")
@@ -49,6 +45,7 @@ func init() {
4945
// Use V(5) for GCE Cloud Provider Call informational logging
5046
// Use V(6) for extra repeated/polling information
5147
klog.InitFlags(flag.CommandLine)
48+
flag.Set("logtostderr", "true")
5249
}
5350

5451
func main() {

pkg/gce-pd-csi-driver/node.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -528,7 +528,7 @@ func (ns *GCENodeServer) getDevicePath(volumeID string, partition string) (strin
528528
}
529529

530530
devicePaths := ns.DeviceUtils.GetDiskByIdPaths(deviceName, partition)
531-
devicePath, err := ns.DeviceUtils.VerifyDevicePath(devicePaths)
531+
devicePath, err := ns.DeviceUtils.VerifyDevicePath(devicePaths, volumeKey.Name)
532532

533533
if err != nil {
534534
return "", fmt.Errorf("error verifying GCE PD (%q) is attached: %v", volumeKey.Name, err)

pkg/mount-manager/device-utils.go

+146-46
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,13 @@ import (
2020
"os/exec"
2121
"path"
2222
"path/filepath"
23+
"regexp"
2324
"strings"
25+
"time"
2426

25-
"k8s.io/apimachinery/pkg/util/sets"
27+
"k8s.io/apimachinery/pkg/util/wait"
2628
"k8s.io/klog"
29+
pathutils "k8s.io/utils/path"
2730
)
2831

2932
const (
@@ -46,6 +49,14 @@ const (
4649
// 'fsck' found errors but exited without correcting them
4750
fsckErrorsUncorrected = 4
4851
defaultMountCommand = "mount"
52+
// scsi_id output should be in the form of:
53+
// 0Google PersistentDisk <disk name>
54+
scsiPattern = `^0Google\s+PersistentDisk\s+([\S]+)\s*$`
55+
)
56+
57+
var (
58+
// regex to parse scsi_id output and extract the serial
59+
scsiRegex = regexp.MustCompile(scsiPattern)
4960
)
5061

5162
// DeviceUtils are a collection of methods that act on the devices attached
@@ -57,7 +68,7 @@ type DeviceUtils interface {
5768

5869
// VerifyDevicePath returns the first of the list of device paths that
5970
// exists on the machine, or an empty string if none exists
60-
VerifyDevicePath(devicePaths []string) (string, error)
71+
VerifyDevicePath(devicePaths []string, diskName string) (string, error)
6172
}
6273

6374
type deviceUtils struct {
@@ -85,74 +96,163 @@ func (m *deviceUtils) GetDiskByIdPaths(deviceName string, partition string) []st
8596
return devicePaths
8697
}
8798

88-
// Returns the first path that exists, or empty string if none exist.
89-
func (m *deviceUtils) VerifyDevicePath(devicePaths []string) (string, error) {
90-
sdBefore, err := filepath.Glob(diskSDPattern)
91-
if err != nil {
92-
// Seeing this error means that the diskSDPattern is malformed.
93-
klog.Errorf("Error filepath.Glob(\"%s\"): %v\r\n", diskSDPattern, err)
99+
func existingDevicePath(devicePaths []string) (string, error) {
100+
for _, devicePath := range devicePaths {
101+
if pathExists, err := pathExists(devicePath); err != nil {
102+
return "", fmt.Errorf("error checking if path exists: %v", err)
103+
} else if pathExists {
104+
return devicePath, nil
105+
}
94106
}
95-
sdBeforeSet := sets.NewString(sdBefore...)
96-
// TODO(#69): Verify udevadm works as intended in driver
97-
if err := udevadmChangeToNewDrives(sdBeforeSet); err != nil {
98-
// udevadm errors should not block disk detachment, log and continue
99-
klog.Errorf("udevadmChangeToNewDrives failed with: %v", err)
107+
return "", nil
108+
}
109+
110+
// getScsiSerial assumes that /lib/udev/scsi_id exists and will error if it
111+
// doesnt. It is the callers responsibility to verify the existence of this
112+
// tool. Calls scsi_id on the given devicePath to get the serial number reported
113+
// by that device.
114+
func getScsiSerial(devicePath string) (string, error) {
115+
out, err := exec.Command(
116+
"/lib/udev_containerized/scsi_id",
117+
"--page=0x83",
118+
"--whitelisted",
119+
fmt.Sprintf("--device=%v", devicePath)).CombinedOutput()
120+
if err != nil {
121+
return "", fmt.Errorf("scsi_id failed for device %q with output %s: %v", devicePath, string(out), err)
100122
}
101123

102-
for _, path := range devicePaths {
103-
if pathExists, err := pathExists(path); err != nil {
104-
return "", fmt.Errorf("Error checking if path exists: %v", err)
105-
} else if pathExists {
106-
return path, nil
107-
}
124+
return parseScsiSerial(string(out))
125+
}
126+
127+
// Parse the output returned by scsi_id and extract the serial number
128+
func parseScsiSerial(output string) (string, error) {
129+
substrings := scsiRegex.FindStringSubmatch(output)
130+
if substrings == nil {
131+
return "", fmt.Errorf("scsi_id output cannot be parsed: %q", output)
108132
}
109133

110-
return "", nil
134+
return substrings[1], nil
111135
}
112136

113-
// Triggers the application of udev rules by calling "udevadm trigger
114-
// --action=change" for newly created "/dev/sd*" drives (exist only in
115-
// after set). This is workaround for Issue #7972. Once the underlying
116-
// issue has been resolved, this may be removed.
117-
func udevadmChangeToNewDrives(sdBeforeSet sets.String) error {
118-
sdAfter, err := filepath.Glob(diskSDPattern)
137+
// VerifyDevicePath returns the first devicePath that maps to a real disk in the
138+
// candidate devicePaths or an empty string if none is found. If
139+
// /lib/udev_containerized/scsi_id exists it will attempt to fix any issues
140+
// caused by missing paths or mismatched devices by running a udevadm --trigger.
141+
func (m *deviceUtils) VerifyDevicePath(devicePaths []string, diskName string) (string, error) {
142+
var devicePath string
143+
var err error
144+
const (
145+
pollInterval = 500 * time.Millisecond
146+
pollTimeout = 3 * time.Second
147+
)
148+
149+
scsiIDPath := "/lib/udev_containerized/scsi_id"
150+
exists, err := pathutils.Exists(pathutils.CheckFollowSymlink, scsiIDPath)
119151
if err != nil {
120-
return fmt.Errorf("Error filepath.Glob(\"%s\"): %v\r\n", diskSDPattern, err)
152+
return "", fmt.Errorf("failed to check scsi_id existence: %v", err)
153+
}
154+
if !exists {
155+
// No SCSI ID tool, the driver should be containerized with the tool so
156+
// maybe something is wrong with the build process
157+
return "", fmt.Errorf("could not find scsi_id tool at %s, unable to verify device paths", scsiIDPath)
121158
}
122159

123-
for _, sd := range sdAfter {
124-
if !sdBeforeSet.Has(sd) {
125-
return udevadmChangeToDrive(sd)
160+
err = wait.Poll(pollInterval, pollTimeout, func() (bool, error) {
161+
var innerErr error
162+
163+
devicePath, innerErr = existingDevicePath(devicePaths)
164+
if innerErr != nil {
165+
return false, fmt.Errorf("failed to check for existing device path: %v", innerErr)
166+
}
167+
168+
if len(devicePath) == 0 {
169+
// Couldn't find the path so we need to find a /dev/sdx with the SCSI
170+
// serial that matches diskName. Then we run udevadm trigger on that
171+
// device to get the device to show up in /dev/by-id/
172+
innerErr := udevadmTriggerForDiskIfExists(diskName)
173+
if innerErr != nil {
174+
return false, fmt.Errorf("failed to trigger udevadm fix for disk %s: %v", diskName, innerErr)
175+
}
176+
// Go to next retry loop to get the deviceName again after
177+
// potentially fixing it with the udev command
178+
return false, nil
179+
}
180+
181+
// If there exists a devicePath we make sure disk at /dev/sdx matches the
182+
// expected disk at devicePath by matching SCSI Serial to the disk name
183+
devSDX, innerErr := filepath.EvalSymlinks(devicePath)
184+
if innerErr != nil {
185+
return false, fmt.Errorf("filepath.EvalSymlinks(%q) failed with %v", devicePath, innerErr)
186+
}
187+
// Check to make sure device path maps to the correct disk
188+
if strings.Contains(devSDX, diskSDPath) {
189+
scsiSerial, innerErr := getScsiSerial(devSDX)
190+
if innerErr != nil {
191+
return false, fmt.Errorf("couldn't get SCSI serial number for disk %s: %v", diskName, innerErr)
192+
}
193+
// SUCCESS! devicePath points to a /dev/sdx that has a SCSI serial
194+
// equivilant to our disk name
195+
if scsiSerial == diskName {
196+
return true, nil
197+
}
126198
}
199+
// The devicePath is not mapped to the correct disk
200+
innerErr = udevadmTriggerForDiskIfExists(diskName)
201+
if innerErr != nil {
202+
return false, fmt.Errorf("failed to trigger udevadm fix for disk %s: %v", diskName, innerErr)
203+
}
204+
// Go to next retry loop to get the deviceName again after
205+
// potentially fixing it with the udev command
206+
return false, nil
207+
})
208+
209+
if err != nil {
210+
return "", fmt.Errorf("failed to find and re-link disk %s with udevadm after retrying for %v: %v", diskName, pollTimeout, err)
127211
}
128212

129-
return nil
213+
return devicePath, nil
130214
}
131215

132-
// Calls "udevadm trigger --action=change" on the specified drive.
133-
// drivePath must be the block device path to trigger on, in the format "/dev/sd*", or a symlink to it.
134-
// This is workaround for Issue #7972. Once the underlying issue has been resolved, this may be removed.
135-
func udevadmChangeToDrive(drivePath string) error {
136-
klog.V(4).Infof("udevadmChangeToDrive: drive=%q", drivePath)
137-
138-
// Evaluate symlink, if any
139-
drive, err := filepath.EvalSymlinks(drivePath)
216+
func udevadmTriggerForDiskIfExists(diskName string) error {
217+
sds, err := filepath.Glob(diskSDPattern)
140218
if err != nil {
141-
return fmt.Errorf("udevadmChangeToDrive: filepath.EvalSymlinks(%q) failed with %v.", drivePath, err)
219+
return fmt.Errorf("failed to filepath.Glob(\"%s\"): %v", diskSDPattern, err)
142220
}
143-
// Check to make sure input is "/dev/sd*"
144-
if !strings.Contains(drive, diskSDPath) {
145-
return fmt.Errorf("udevadmChangeToDrive: expected input in the form \"%s\" but drive is %q.", diskSDPattern, drive)
221+
for _, devSDX := range sds {
222+
scsiSerial, err := getScsiSerial(devSDX)
223+
if err != nil {
224+
return fmt.Errorf("failed to get SCSI Serial num for %s: %v", devSDX, err)
225+
}
226+
if scsiSerial == diskName {
227+
// Found the disk that we're looking for so run a trigger on it
228+
// to resolve its /dev/by-id/ path
229+
klog.Warningf("udevadm --trigger running to fix disk at path %s which has SCSI ID %s", devSDX, scsiSerial)
230+
err := udevadmChangeToDrive(devSDX)
231+
if err != nil {
232+
return fmt.Errorf("failed to fix disk at path %s which has SCSI ID %s: %v", devSDX, scsiSerial, err)
233+
}
234+
return nil
235+
}
146236
}
237+
return fmt.Errorf("udevadm --trigger requested to fix disk %s but no such disk was found in %v", diskName, sds)
238+
}
147239

240+
// Calls "udevadm trigger --action=change" on the specified drive. drivePath
241+
// must be the block device path to trigger on, in the format "/dev/sd*", or a
242+
// symlink to it. This is workaround for Issue #7972. Once the underlying issue
243+
// has been resolved, this may be removed.
244+
// udevadm takes a little bit to work its magic in the background so any callers
245+
// should not expect the trigger to complete instantly and may need to poll for
246+
// the change
247+
func udevadmChangeToDrive(devSDX string) error {
148248
// Call "udevadm trigger --action=change --property-match=DEVNAME=/dev/sd..."
149-
_, err = exec.Command(
249+
out, err := exec.Command(
150250
"udevadm",
151251
"trigger",
152252
"--action=change",
153-
fmt.Sprintf("--property-match=DEVNAME=%s", drive)).CombinedOutput()
253+
fmt.Sprintf("--property-match=DEVNAME=%s", devSDX)).CombinedOutput()
154254
if err != nil {
155-
return fmt.Errorf("udevadmChangeToDrive: udevadm trigger failed for drive %q with %v.", drive, err)
255+
return fmt.Errorf("udevadmChangeToDrive: udevadm trigger failed for drive %q with output %s: %v.", devSDX, string(out), err)
156256
}
157257
return nil
158258
}

pkg/mount-manager/fake-device-utils.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ func (m *fakeDeviceUtils) GetDiskByIdPaths(pdName string, partition string) []st
3030
}
3131

3232
// Returns the first path that exists, or empty string if none exist.
33-
func (m *fakeDeviceUtils) VerifyDevicePath(devicePaths []string) (string, error) {
33+
func (m *fakeDeviceUtils) VerifyDevicePath(devicePaths []string, diskName string) (string, error) {
3434
// Return any random device path to use as mount source
3535
return "/dev/disk/fake-path", nil
3636
}

test/e2e/tests/setup_e2e_test.go

+10
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,16 @@ var _ = BeforeSuite(func() {
8989
klog.Fatalf("Failed to setup instance %v: %v", nodeID, err)
9090
}
9191

92+
err = testutils.MkdirAll(i, "/lib/udev_containerized")
93+
if err != nil {
94+
klog.Fatalf("Could not make scsi_id containerized directory: %v", err)
95+
}
96+
97+
err = testutils.CopyFile(i, "/lib/udev/scsi_id", "/lib/udev_containerized/scsi_id")
98+
if err != nil {
99+
klog.Fatalf("could not copy scsi_id to containerized directory: %v", err)
100+
}
101+
92102
klog.Infof("Creating new driver and client for node %s\n", i.GetName())
93103
// Create new driver and client
94104
testContext, err := testutils.GCEClientAndDriverSetup(i)

0 commit comments

Comments
 (0)