Skip to content

Commit 4bd3d05

Browse files
committed
Update VerifyDevicePath logic to run udevadm --trigger to fix scenarios where disk is not found or is wrong device
1 parent dd14907 commit 4bd3d05

File tree

7 files changed

+216
-55
lines changed

7 files changed

+216
-55
lines changed

Dockerfile

+5-1
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,15 @@ ADD . .
2020
ARG TAG
2121
RUN CGO_ENABLED=0 GOOS=linux go build -a -ldflags '-X main.vendorVersion='"${TAG:-latest}"' -extldflags "-static"' -o bin/gce-pd-csi-driver ./cmd/
2222

23+
# MAD HACKS: Build a version first so we can take the scsi_id bin and put it somewhere else in our real build
24+
FROM gcr.io/google-containers/debian-base-amd64:v2.0.0 as base
25+
RUN clean-install udev
26+
2327
# Start from Google Debian base
2428
FROM gcr.io/google-containers/debian-base-amd64:v2.0.0
2529
COPY --from=builder /go/src/sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/bin/gce-pd-csi-driver /gce-pd-csi-driver
26-
2730
# Install necessary dependencies
2831
RUN clean-install util-linux e2fsprogs mount ca-certificates udev xfsprogs
32+
COPY --from=base /lib/udev/scsi_id /lib/udev_containerized/scsi_id
2933

3034
ENTRYPOINT ["/gce-pd-csi-driver"]

cmd/main.go

+1-4
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,6 @@ import (
2828
mountmanager "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/mount-manager"
2929
)
3030

31-
func init() {
32-
flag.Set("logtostderr", "true")
33-
}
34-
3531
var (
3632
endpoint = flag.String("endpoint", "unix:/tmp/csi.sock", "CSI endpoint")
3733
gceConfigFilePath = flag.String("cloud-config", "", "Path to GCE cloud provider config")
@@ -49,6 +45,7 @@ func init() {
4945
// Use V(5) for GCE Cloud Provider Call informational logging
5046
// Use V(6) for extra repeated/polling information
5147
klog.InitFlags(flag.CommandLine)
48+
flag.Set("logtostderr", "true")
5249
}
5350

5451
func main() {

pkg/gce-pd-csi-driver/node.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -528,7 +528,7 @@ func (ns *GCENodeServer) getDevicePath(volumeID string, partition string) (strin
528528
}
529529

530530
devicePaths := ns.DeviceUtils.GetDiskByIdPaths(deviceName, partition)
531-
devicePath, err := ns.DeviceUtils.VerifyDevicePath(devicePaths)
531+
devicePath, err := ns.DeviceUtils.VerifyDevicePath(devicePaths, volumeKey.Name)
532532

533533
if err != nil {
534534
return "", fmt.Errorf("error verifying GCE PD (%q) is attached: %v", volumeKey.Name, err)

pkg/mount-manager/device-utils.go

+125-46
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,12 @@ import (
2020
"os/exec"
2121
"path"
2222
"path/filepath"
23+
"regexp"
2324
"strings"
25+
"time"
2426

25-
"k8s.io/apimachinery/pkg/util/sets"
2627
"k8s.io/klog"
28+
pathutils "k8s.io/utils/path"
2729
)
2830

2931
const (
@@ -46,6 +48,14 @@ const (
4648
// 'fsck' found errors but exited without correcting them
4749
fsckErrorsUncorrected = 4
4850
defaultMountCommand = "mount"
51+
// scsi_id output should be in the form of:
52+
// 0Google PersistentDisk <disk name>
53+
scsiPattern = `^0Google\s+PersistentDisk\s+([\S]+)\s*$`
54+
)
55+
56+
var (
57+
// regex to parse scsi_id output and extract the serial
58+
scsiRegex = regexp.MustCompile(scsiPattern)
4959
)
5060

5161
// DeviceUtils are a collection of methods that act on the devices attached
@@ -57,7 +67,7 @@ type DeviceUtils interface {
5767

5868
// VerifyDevicePath returns the first of the list of device paths that
5969
// exists on the machine, or an empty string if none exists
60-
VerifyDevicePath(devicePaths []string) (string, error)
70+
VerifyDevicePath(devicePaths []string, diskName string) (string, error)
6171
}
6272

6373
type deviceUtils struct {
@@ -85,75 +95,144 @@ func (m *deviceUtils) GetDiskByIdPaths(deviceName string, partition string) []st
8595
return devicePaths
8696
}
8797

88-
// Returns the first path that exists, or empty string if none exist.
89-
func (m *deviceUtils) VerifyDevicePath(devicePaths []string) (string, error) {
90-
sdBefore, err := filepath.Glob(diskSDPattern)
91-
if err != nil {
92-
// Seeing this error means that the diskSDPattern is malformed.
93-
klog.Errorf("Error filepath.Glob(\"%s\"): %v\r\n", diskSDPattern, err)
94-
}
95-
sdBeforeSet := sets.NewString(sdBefore...)
96-
// TODO(#69): Verify udevadm works as intended in driver
97-
if err := udevadmChangeToNewDrives(sdBeforeSet); err != nil {
98-
// udevadm errors should not block disk detachment, log and continue
99-
klog.Errorf("udevadmChangeToNewDrives failed with: %v", err)
100-
}
101-
102-
for _, path := range devicePaths {
103-
if pathExists, err := pathExists(path); err != nil {
104-
return "", fmt.Errorf("Error checking if path exists: %v", err)
98+
func existingDevicePath(devicePaths []string) (string, error) {
99+
for _, devicePath := range devicePaths {
100+
if pathExists, err := pathExists(devicePath); err != nil {
101+
return "", fmt.Errorf("error checking if path exists: %v", err)
105102
} else if pathExists {
106-
return path, nil
103+
return devicePath, nil
107104
}
108105
}
109-
110106
return "", nil
111107
}
112108

113-
// Triggers the application of udev rules by calling "udevadm trigger
114-
// --action=change" for newly created "/dev/sd*" drives (exist only in
115-
// after set). This is workaround for Issue #7972. Once the underlying
116-
// issue has been resolved, this may be removed.
117-
func udevadmChangeToNewDrives(sdBeforeSet sets.String) error {
118-
sdAfter, err := filepath.Glob(diskSDPattern)
109+
// getScsiSerial assumes that /lib/udev/scsi_id exists and will error if it
110+
// doesnt. It is the callers responsibility to verify the existence of this
111+
// tool. Calls scsi_id on the given devicePath to get the serial number reported
112+
// by that device.
113+
func getScsiSerial(devicePath string) (string, error) {
114+
out, err := exec.Command(
115+
"/lib/udev_containerized/scsi_id",
116+
"--page=0x83",
117+
"--whitelisted",
118+
fmt.Sprintf("--device=%v", devicePath)).CombinedOutput()
119119
if err != nil {
120-
return fmt.Errorf("Error filepath.Glob(\"%s\"): %v\r\n", diskSDPattern, err)
120+
return "", fmt.Errorf("scsi_id failed for device %q with output %s: %v", devicePath, string(out), err)
121121
}
122122

123-
for _, sd := range sdAfter {
124-
if !sdBeforeSet.Has(sd) {
125-
return udevadmChangeToDrive(sd)
126-
}
123+
return parseScsiSerial(string(out))
124+
}
125+
126+
// Parse the output returned by scsi_id and extract the serial number
127+
func parseScsiSerial(output string) (string, error) {
128+
substrings := scsiRegex.FindStringSubmatch(output)
129+
if substrings == nil {
130+
return "", fmt.Errorf("scsi_id output cannot be parsed: %q", output)
127131
}
128132

129-
return nil
133+
return substrings[1], nil
130134
}
131135

132-
// Calls "udevadm trigger --action=change" on the specified drive.
133-
// drivePath must be the block device path to trigger on, in the format "/dev/sd*", or a symlink to it.
134-
// This is workaround for Issue #7972. Once the underlying issue has been resolved, this may be removed.
135-
func udevadmChangeToDrive(drivePath string) error {
136-
klog.V(4).Infof("udevadmChangeToDrive: drive=%q", drivePath)
136+
// VerifyDevicePath returns the first devicePath that maps to a real disk in the
137+
// candidate devicePaths or an empty string if none is found. If
138+
// /lib/udev/scsi_id exists it will attempt to fix any issues caused by missing
139+
// paths or mismatched devices by running a udevadm --trigger.
140+
func (m *deviceUtils) VerifyDevicePath(devicePaths []string, diskName string) (string, error) {
141+
devicePath, err := existingDevicePath(devicePaths)
142+
if err != nil {
143+
return "", fmt.Errorf("failed to check for existing device path: %v", err)
144+
}
137145

138-
// Evaluate symlink, if any
139-
drive, err := filepath.EvalSymlinks(drivePath)
146+
scsiIDPath := "/lib/udev_containerized/scsi_id"
147+
exists, err := pathutils.Exists(pathutils.CheckFollowSymlink, scsiIDPath)
140148
if err != nil {
141-
return fmt.Errorf("udevadmChangeToDrive: filepath.EvalSymlinks(%q) failed with %v.", drivePath, err)
149+
return "", fmt.Errorf("failed to check scsi_id existence: %v", err)
150+
}
151+
if !exists {
152+
// No SCSI ID tool, no udevadm fixing is possible so just return best
153+
// effort devicePath.
154+
klog.Warningf("Could not find scsi_id tool at %s, no udevadm fixing is possible so we just return the best effort devicePath", scsiIDPath)
155+
return devicePath, nil
156+
}
157+
158+
if len(devicePath) == 0 {
159+
// Couldn't find the path so we need to find a /dev/sdx with the SCSI
160+
// serial that matches diskName. Then we run udevadm trigger on that
161+
// device to get the device to show up in /dev/by-id/
162+
err := udevadmTriggerForDiskIfExists(diskName)
163+
if err != nil {
164+
return "", fmt.Errorf("failed to trigger udevadm fix for disk %s: %v", diskName, err)
165+
}
166+
// Try get the deviceName again after potentially fixing it with the
167+
// udev command
168+
return existingDevicePath(devicePaths)
169+
}
170+
171+
// If there exists a devicePath we make sure disk at /dev/sdx matches the
172+
// expected disk at devicePath by matching SCSI Serial to the disk name
173+
devSDX, err := filepath.EvalSymlinks(devicePath)
174+
if err != nil {
175+
return "", fmt.Errorf("filepath.EvalSymlinks(%q) failed with %v.", devicePath, err)
142176
}
143177
// Check to make sure input is "/dev/sd*"
144-
if !strings.Contains(drive, diskSDPath) {
145-
return fmt.Errorf("udevadmChangeToDrive: expected input in the form \"%s\" but drive is %q.", diskSDPattern, drive)
178+
if !strings.Contains(devSDX, diskSDPath) {
179+
return "", fmt.Errorf("expected input in the form \"%s\" but drive is %q.", diskSDPattern, devSDX)
180+
}
181+
scsiSerial, err := getScsiSerial(devSDX)
182+
if err != nil {
183+
return "", fmt.Errorf("couldn't get SCSI serial number for disk %s: %v", diskName, err)
146184
}
185+
if scsiSerial != diskName {
186+
err := udevadmTriggerForDiskIfExists(diskName)
187+
if err != nil {
188+
return "", fmt.Errorf("failed to trigger udevadm fix for disk %s: %v", diskName, err)
189+
}
190+
}
191+
return devicePath, nil
192+
}
147193

194+
func udevadmTriggerForDiskIfExists(diskName string) error {
195+
sds, err := filepath.Glob(diskSDPattern)
196+
if err != nil {
197+
return fmt.Errorf("failed to filepath.Glob(\"%s\"): %v", diskSDPattern, err)
198+
}
199+
for _, devSDX := range sds {
200+
scsiSerial, err := getScsiSerial(devSDX)
201+
if err != nil {
202+
return fmt.Errorf("failed to get SCSI Serial num for %s: %v", devSDX, err)
203+
}
204+
if scsiSerial == diskName {
205+
// Found the disk that we're looking for so run a trigger on it
206+
// to resolve its /dev/by-id/ path
207+
klog.Warningf("udevadm --trigger running to fix disk at path %s which has SCSI ID %s", devSDX, scsiSerial)
208+
err := udevadmChangeToDrive(devSDX)
209+
if err != nil {
210+
return fmt.Errorf("failed to fix disk at path %s which has SCSI ID %s: %v", devSDX, scsiSerial, err)
211+
}
212+
return nil
213+
}
214+
}
215+
klog.Warningf("udevadm --trigger requested to fix disk %s but no such disk was found in %v", diskName, sds)
216+
return nil
217+
}
218+
219+
// Calls "udevadm trigger --action=change" on the specified drive.
220+
// drivePath must be the block device path to trigger on, in the format "/dev/sd*", or a symlink to it.
221+
// This is workaround for Issue #7972. Once the underlying issue has been resolved, this may be removed.
222+
func udevadmChangeToDrive(devSDX string) error {
148223
// Call "udevadm trigger --action=change --property-match=DEVNAME=/dev/sd..."
149-
_, err = exec.Command(
224+
out, err := exec.Command(
150225
"udevadm",
151226
"trigger",
152227
"--action=change",
153-
fmt.Sprintf("--property-match=DEVNAME=%s", drive)).CombinedOutput()
228+
fmt.Sprintf("--property-match=DEVNAME=%s", devSDX)).CombinedOutput()
154229
if err != nil {
155-
return fmt.Errorf("udevadmChangeToDrive: udevadm trigger failed for drive %q with %v.", drive, err)
230+
return fmt.Errorf("udevadmChangeToDrive: udevadm trigger failed for drive %q with output %s: %v.", devSDX, string(out), err)
156231
}
232+
// udevadm takes a little bit to work its magic in the background - since
233+
// this fix only gets called when something is wrong and we *need* the
234+
// udevadm we wait a second or two before returning a successful udevadm
235+
time.Sleep(time.Second)
157236
return nil
158237
}
159238

pkg/mount-manager/fake-device-utils.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ func (m *fakeDeviceUtils) GetDiskByIdPaths(pdName string, partition string) []st
3030
}
3131

3232
// Returns the first path that exists, or empty string if none exist.
33-
func (m *fakeDeviceUtils) VerifyDevicePath(devicePaths []string) (string, error) {
33+
func (m *fakeDeviceUtils) VerifyDevicePath(devicePaths []string, diskName string) (string, error) {
3434
// Return any random device path to use as mount source
3535
return "/dev/disk/fake-path", nil
3636
}

test/e2e/tests/single_zone_e2e_test.go

+65
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,16 @@ package tests
1717
import (
1818
"context"
1919
"fmt"
20+
"path/filepath"
2021
"strings"
2122
"time"
2223

2324
"k8s.io/apimachinery/pkg/util/uuid"
2425
"k8s.io/apimachinery/pkg/util/wait"
26+
"k8s.io/klog"
2527
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
2628
gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute"
29+
testutils "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/test/e2e/utils"
2730
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/test/remote"
2831

2932
csi "github.com/container-storage-interface/spec/lib/go/csi"
@@ -81,7 +84,69 @@ var _ = Describe("GCE PD CSI Driver", func() {
8184
// Attach Disk
8285
err := testAttachWriteReadDetach(volID, volName, instance, client, false /* readOnly */)
8386
Expect(err).To(BeNil(), "Failed to go through volume lifecycle")
87+
})
88+
89+
It("Should automatically fix symlink errors between /dev/sdx and /dev/by-id if disk is not found or incorrect", func() {
90+
testContext := getRandomTestContext()
91+
92+
p, z, _ := testContext.Instance.GetIdentity()
93+
client := testContext.Client
94+
instance := testContext.Instance
95+
96+
// Set-up instance to have scsi_id where we expect it
97+
err := testutils.MkdirAll(instance, "/lib/udev_containerized")
98+
Expect(err).To(BeNil(), "could not make scsi_id containerized directory")
99+
err = testutils.CopyFile(instance, "/lib/udev/scsi_id", "/lib/udev_containerized/scsi_id")
100+
Expect(err).To(BeNil(), "could not copy scsi_id to containerized directory")
101+
102+
// Create Disk
103+
volName, volID := createAndValidateUniqueZonalDisk(client, p, z)
104+
105+
defer func() {
106+
// Delete Disk
107+
err := client.DeleteVolume(volID)
108+
Expect(err).To(BeNil(), "DeleteVolume failed")
109+
110+
// Validate Disk Deleted
111+
_, err = computeService.Disks.Get(p, z, volName).Do()
112+
Expect(gce.IsGCEError(err, "notFound")).To(BeTrue(), "Expected disk to not be found")
113+
}()
114+
115+
// Attach Disk
116+
err = client.ControllerPublishVolume(volID, instance.GetNodeID())
117+
Expect(err).To(BeNil(), "ControllerPublishVolume failed with error for disk %v on node %v: %v", volID, instance.GetNodeID())
118+
119+
defer func() {
120+
// Detach Disk
121+
err = client.ControllerUnpublishVolume(volID, instance.GetNodeID())
122+
if err != nil {
123+
klog.Errorf("Failed to detach disk: %v", err)
124+
}
125+
126+
}()
84127

128+
// MESS UP THE symlink
129+
130+
err = testutils.RmAll(instance, "/dev/disk/by-id")
131+
Expect(err).To(BeNil(), "failed to remove /dev/by-id folder")
132+
133+
// Stage Disk
134+
stageDir := filepath.Join("/tmp/", volName, "stage")
135+
err = client.NodeStageExt4Volume(volID, stageDir)
136+
Expect(err).To(BeNil(), "failed to repair /dev/by-id symlink and stage volume")
137+
138+
defer func() {
139+
// Unstage Disk
140+
err = client.NodeUnstageVolume(volID, stageDir)
141+
if err != nil {
142+
klog.Errorf("Failed to unstage volume: %v", err)
143+
}
144+
fp := filepath.Join("/tmp/", volName)
145+
err = testutils.RmAll(instance, fp)
146+
if err != nil {
147+
klog.Errorf("Failed to rm file path %s: %v", fp, err)
148+
}
149+
}()
85150
})
86151

87152
It("Should create disks in correct zones when topology is specified", func() {

test/e2e/utils/utils.go

+18-2
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ func GCEClientAndDriverSetup(instance *remote.InstanceInfo) (*remote.TestContext
4949
endpoint := fmt.Sprintf("tcp://localhost:%s", port)
5050

5151
workspace := remote.NewWorkspaceDir("gce-pd-e2e-")
52-
driverRunCmd := fmt.Sprintf("sh -c '/usr/bin/nohup %s/gce-pd-csi-driver --endpoint=%s> %s/prog.out 2> %s/prog.err < /dev/null &'",
53-
workspace, endpoint, workspace, workspace)
52+
driverRunCmd := fmt.Sprintf("sh -c '/usr/bin/nohup %s/gce-pd-csi-driver -v=4 --endpoint=%s 2> %s/prog.out < /dev/null > /dev/null &'",
53+
workspace, endpoint, workspace)
5454

5555
config := &remote.ClientConfig{
5656
PkgPath: pkgPath,
@@ -202,3 +202,19 @@ func RmAll(instance *remote.InstanceInfo, filePath string) error {
202202
}
203203
return nil
204204
}
205+
206+
func MkdirAll(instance *remote.InstanceInfo, dir string) error {
207+
output, err := instance.SSH("mkdir", "-p", dir)
208+
if err != nil {
209+
return fmt.Errorf("failed to mkdir -p %s. Output: %v, errror: %v", dir, output, err)
210+
}
211+
return nil
212+
}
213+
214+
func CopyFile(instance *remote.InstanceInfo, src, dest string) error {
215+
output, err := instance.SSH("cp", src, dest)
216+
if err != nil {
217+
return fmt.Errorf("failed to copy %s to %s. Output: %v, errror: %v", src, dest, output, err)
218+
}
219+
return nil
220+
}

0 commit comments

Comments
 (0)