Skip to content

Commit a2c0480

Browse files
committed
Update VerifyDevicePath logic to run udevadm --trigger to fix scenarios where disk is not found or is wrong device
1 parent dd14907 commit a2c0480

File tree

7 files changed

+225
-55
lines changed

7 files changed

+225
-55
lines changed

Dockerfile

+5-1
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,15 @@ ADD . .
2020
ARG TAG
2121
RUN CGO_ENABLED=0 GOOS=linux go build -a -ldflags '-X main.vendorVersion='"${TAG:-latest}"' -extldflags "-static"' -o bin/gce-pd-csi-driver ./cmd/
2222

23+
# MAD HACKS: Build a version first so we can take the scsi_id bin and put it somewhere else in our real build
24+
FROM gcr.io/google-containers/debian-base-amd64:v2.0.0 as base
25+
RUN clean-install udev
26+
2327
# Start from Google Debian base
2428
FROM gcr.io/google-containers/debian-base-amd64:v2.0.0
2529
COPY --from=builder /go/src/sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/bin/gce-pd-csi-driver /gce-pd-csi-driver
26-
2730
# Install necessary dependencies
2831
RUN clean-install util-linux e2fsprogs mount ca-certificates udev xfsprogs
32+
COPY --from=base /lib/udev/scsi_id /lib/udev_containerized/scsi_id
2933

3034
ENTRYPOINT ["/gce-pd-csi-driver"]

cmd/main.go

+1-4
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,6 @@ import (
2828
mountmanager "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/mount-manager"
2929
)
3030

31-
func init() {
32-
flag.Set("logtostderr", "true")
33-
}
34-
3531
var (
3632
endpoint = flag.String("endpoint", "unix:/tmp/csi.sock", "CSI endpoint")
3733
gceConfigFilePath = flag.String("cloud-config", "", "Path to GCE cloud provider config")
@@ -49,6 +45,7 @@ func init() {
4945
// Use V(5) for GCE Cloud Provider Call informational logging
5046
// Use V(6) for extra repeated/polling information
5147
klog.InitFlags(flag.CommandLine)
48+
flag.Set("logtostderr", "true")
5249
}
5350

5451
func main() {

pkg/gce-pd-csi-driver/node.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -528,7 +528,7 @@ func (ns *GCENodeServer) getDevicePath(volumeID string, partition string) (strin
528528
}
529529

530530
devicePaths := ns.DeviceUtils.GetDiskByIdPaths(deviceName, partition)
531-
devicePath, err := ns.DeviceUtils.VerifyDevicePath(devicePaths)
531+
devicePath, err := ns.DeviceUtils.VerifyDevicePath(devicePaths, volumeKey.Name)
532532

533533
if err != nil {
534534
return "", fmt.Errorf("error verifying GCE PD (%q) is attached: %v", volumeKey.Name, err)

pkg/mount-manager/device-utils.go

+127-46
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,12 @@ import (
2020
"os/exec"
2121
"path"
2222
"path/filepath"
23+
"regexp"
2324
"strings"
25+
"time"
2426

25-
"k8s.io/apimachinery/pkg/util/sets"
2627
"k8s.io/klog"
28+
pathutils "k8s.io/utils/path"
2729
)
2830

2931
const (
@@ -46,6 +48,14 @@ const (
4648
// 'fsck' found errors but exited without correcting them
4749
fsckErrorsUncorrected = 4
4850
defaultMountCommand = "mount"
51+
// scsi_id output should be in the form of:
52+
// 0Google PersistentDisk <disk name>
53+
scsiPattern = `^0Google\s+PersistentDisk\s+([\S]+)\s*$`
54+
)
55+
56+
var (
57+
// regex to parse scsi_id output and extract the serial
58+
scsiRegex = regexp.MustCompile(scsiPattern)
4959
)
5060

5161
// DeviceUtils are a collection of methods that act on the devices attached
@@ -57,7 +67,7 @@ type DeviceUtils interface {
5767

5868
// VerifyDevicePath returns the first of the list of device paths that
5969
// exists on the machine, or an empty string if none exists
60-
VerifyDevicePath(devicePaths []string) (string, error)
70+
VerifyDevicePath(devicePaths []string, diskName string) (string, error)
6171
}
6272

6373
type deviceUtils struct {
@@ -85,75 +95,146 @@ func (m *deviceUtils) GetDiskByIdPaths(deviceName string, partition string) []st
8595
return devicePaths
8696
}
8797

88-
// Returns the first path that exists, or empty string if none exist.
89-
func (m *deviceUtils) VerifyDevicePath(devicePaths []string) (string, error) {
90-
sdBefore, err := filepath.Glob(diskSDPattern)
91-
if err != nil {
92-
// Seeing this error means that the diskSDPattern is malformed.
93-
klog.Errorf("Error filepath.Glob(\"%s\"): %v\r\n", diskSDPattern, err)
98+
func existingDevicePath(devicePaths []string) (string, error) {
99+
for _, devicePath := range devicePaths {
100+
if pathExists, err := pathExists(devicePath); err != nil {
101+
return "", fmt.Errorf("error checking if path exists: %v", err)
102+
} else if pathExists {
103+
return devicePath, nil
104+
}
94105
}
95-
sdBeforeSet := sets.NewString(sdBefore...)
96-
// TODO(#69): Verify udevadm works as intended in driver
97-
if err := udevadmChangeToNewDrives(sdBeforeSet); err != nil {
98-
// udevadm errors should not block disk detachment, log and continue
99-
klog.Errorf("udevadmChangeToNewDrives failed with: %v", err)
106+
return "", nil
107+
}
108+
109+
// getScsiSerial assumes that /lib/udev/scsi_id exists and will error if it
110+
// doesnt. It is the callers responsibility to verify the existence of this
111+
// tool. Calls scsi_id on the given devicePath to get the serial number reported
112+
// by that device.
113+
func getScsiSerial(devicePath string) (string, error) {
114+
out, err := exec.Command(
115+
"/lib/udev_containerized/scsi_id",
116+
"--page=0x83",
117+
"--whitelisted",
118+
fmt.Sprintf("--device=%v", devicePath)).CombinedOutput()
119+
if err != nil {
120+
return "", fmt.Errorf("scsi_id failed for device %q with output %s: %v", devicePath, string(out), err)
100121
}
101122

102-
for _, path := range devicePaths {
103-
if pathExists, err := pathExists(path); err != nil {
104-
return "", fmt.Errorf("Error checking if path exists: %v", err)
105-
} else if pathExists {
106-
return path, nil
107-
}
123+
return parseScsiSerial(string(out))
124+
}
125+
126+
// Parse the output returned by scsi_id and extract the serial number
127+
func parseScsiSerial(output string) (string, error) {
128+
substrings := scsiRegex.FindStringSubmatch(output)
129+
if substrings == nil {
130+
return "", fmt.Errorf("scsi_id output cannot be parsed: %q", output)
108131
}
109132

110-
return "", nil
133+
return substrings[1], nil
111134
}
112135

113-
// Triggers the application of udev rules by calling "udevadm trigger
114-
// --action=change" for newly created "/dev/sd*" drives (exist only in
115-
// after set). This is workaround for Issue #7972. Once the underlying
116-
// issue has been resolved, this may be removed.
117-
func udevadmChangeToNewDrives(sdBeforeSet sets.String) error {
118-
sdAfter, err := filepath.Glob(diskSDPattern)
136+
// VerifyDevicePath returns the first devicePath that maps to a real disk in the
137+
// candidate devicePaths or an empty string if none is found. If
138+
// /lib/udev/scsi_id exists it will attempt to fix any issues caused by missing
139+
// paths or mismatched devices by running a udevadm --trigger.
140+
func (m *deviceUtils) VerifyDevicePath(devicePaths []string, diskName string) (string, error) {
141+
devicePath, err := existingDevicePath(devicePaths)
119142
if err != nil {
120-
return fmt.Errorf("Error filepath.Glob(\"%s\"): %v\r\n", diskSDPattern, err)
143+
return "", fmt.Errorf("failed to check for existing device path: %v", err)
121144
}
122145

123-
for _, sd := range sdAfter {
124-
if !sdBeforeSet.Has(sd) {
125-
return udevadmChangeToDrive(sd)
146+
scsiIDPath := "/lib/udev_containerized/scsi_id"
147+
exists, err := pathutils.Exists(pathutils.CheckFollowSymlink, scsiIDPath)
148+
if err != nil {
149+
return "", fmt.Errorf("failed to check scsi_id existence: %v", err)
150+
}
151+
if !exists {
152+
// No SCSI ID tool, no udevadm fixing is possible so just return best
153+
// effort devicePath.
154+
klog.Warningf("Could not find scsi_id tool at %s, no udevadm fixing is possible so we just return the best effort devicePath", scsiIDPath)
155+
return devicePath, nil
156+
}
157+
158+
if len(devicePath) == 0 {
159+
// Couldn't find the path so we need to find a /dev/sdx with the SCSI
160+
// serial that matches diskName. Then we run udevadm trigger on that
161+
// device to get the device to show up in /dev/by-id/
162+
err := udevadmTriggerForDiskIfExists(diskName)
163+
if err != nil {
164+
return "", fmt.Errorf("failed to trigger udevadm fix for disk %s: %v", diskName, err)
126165
}
166+
// Try get the deviceName again after potentially fixing it with the
167+
// udev command
168+
return existingDevicePath(devicePaths)
127169
}
128170

129-
return nil
130-
}
171+
// If there exists a devicePath we make sure disk at /dev/sdx matches the
172+
// expected disk at devicePath by matching SCSI Serial to the disk name
173+
devSDX, err := filepath.EvalSymlinks(devicePath)
174+
if err != nil {
175+
return "", fmt.Errorf("filepath.EvalSymlinks(%q) failed with %v.", devicePath, err)
176+
}
177+
// Check to make sure device path maps to the correct disk
178+
if strings.Contains(devSDX, diskSDPath) {
179+
scsiSerial, err := getScsiSerial(devSDX)
180+
if err != nil {
181+
return "", fmt.Errorf("couldn't get SCSI serial number for disk %s: %v", diskName, err)
182+
}
131183

132-
// Calls "udevadm trigger --action=change" on the specified drive.
133-
// drivePath must be the block device path to trigger on, in the format "/dev/sd*", or a symlink to it.
134-
// This is workaround for Issue #7972. Once the underlying issue has been resolved, this may be removed.
135-
func udevadmChangeToDrive(drivePath string) error {
136-
klog.V(4).Infof("udevadmChangeToDrive: drive=%q", drivePath)
184+
if scsiSerial == diskName {
185+
return devicePath, nil
186+
}
187+
}
188+
// The devicePath is not mapped to the correct disk
189+
err = udevadmTriggerForDiskIfExists(diskName)
190+
if err != nil {
191+
return "", fmt.Errorf("failed to trigger udevadm fix for disk %s: %v", diskName, err)
192+
}
193+
return devicePath, nil
194+
}
137195

138-
// Evaluate symlink, if any
139-
drive, err := filepath.EvalSymlinks(drivePath)
196+
func udevadmTriggerForDiskIfExists(diskName string) error {
197+
sds, err := filepath.Glob(diskSDPattern)
140198
if err != nil {
141-
return fmt.Errorf("udevadmChangeToDrive: filepath.EvalSymlinks(%q) failed with %v.", drivePath, err)
199+
return fmt.Errorf("failed to filepath.Glob(\"%s\"): %v", diskSDPattern, err)
142200
}
143-
// Check to make sure input is "/dev/sd*"
144-
if !strings.Contains(drive, diskSDPath) {
145-
return fmt.Errorf("udevadmChangeToDrive: expected input in the form \"%s\" but drive is %q.", diskSDPattern, drive)
201+
for _, devSDX := range sds {
202+
scsiSerial, err := getScsiSerial(devSDX)
203+
if err != nil {
204+
return fmt.Errorf("failed to get SCSI Serial num for %s: %v", devSDX, err)
205+
}
206+
if scsiSerial == diskName {
207+
// Found the disk that we're looking for so run a trigger on it
208+
// to resolve its /dev/by-id/ path
209+
klog.Warningf("udevadm --trigger running to fix disk at path %s which has SCSI ID %s", devSDX, scsiSerial)
210+
err := udevadmChangeToDrive(devSDX)
211+
if err != nil {
212+
return fmt.Errorf("failed to fix disk at path %s which has SCSI ID %s: %v", devSDX, scsiSerial, err)
213+
}
214+
return nil
215+
}
146216
}
217+
klog.Warningf("udevadm --trigger requested to fix disk %s but no such disk was found in %v", diskName, sds)
218+
return nil
219+
}
147220

221+
// Calls "udevadm trigger --action=change" on the specified drive.
222+
// drivePath must be the block device path to trigger on, in the format "/dev/sd*", or a symlink to it.
223+
// This is workaround for Issue #7972. Once the underlying issue has been resolved, this may be removed.
224+
func udevadmChangeToDrive(devSDX string) error {
148225
// Call "udevadm trigger --action=change --property-match=DEVNAME=/dev/sd..."
149-
_, err = exec.Command(
226+
out, err := exec.Command(
150227
"udevadm",
151228
"trigger",
152229
"--action=change",
153-
fmt.Sprintf("--property-match=DEVNAME=%s", drive)).CombinedOutput()
230+
fmt.Sprintf("--property-match=DEVNAME=%s", devSDX)).CombinedOutput()
154231
if err != nil {
155-
return fmt.Errorf("udevadmChangeToDrive: udevadm trigger failed for drive %q with %v.", drive, err)
232+
return fmt.Errorf("udevadmChangeToDrive: udevadm trigger failed for drive %q with output %s: %v.", devSDX, string(out), err)
156233
}
234+
// udevadm takes a little bit to work its magic in the background - since
235+
// this fix only gets called when something is wrong and we *need* the
236+
// udevadm we wait a second or two before returning a successful udevadm
237+
time.Sleep(time.Second)
157238
return nil
158239
}
159240

pkg/mount-manager/fake-device-utils.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ func (m *fakeDeviceUtils) GetDiskByIdPaths(pdName string, partition string) []st
3030
}
3131

3232
// Returns the first path that exists, or empty string if none exist.
33-
func (m *fakeDeviceUtils) VerifyDevicePath(devicePaths []string) (string, error) {
33+
func (m *fakeDeviceUtils) VerifyDevicePath(devicePaths []string, diskName string) (string, error) {
3434
// Return any random device path to use as mount source
3535
return "/dev/disk/fake-path", nil
3636
}

test/e2e/tests/single_zone_e2e_test.go

+64
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,16 @@ package tests
1717
import (
1818
"context"
1919
"fmt"
20+
"path/filepath"
2021
"strings"
2122
"time"
2223

2324
"k8s.io/apimachinery/pkg/util/uuid"
2425
"k8s.io/apimachinery/pkg/util/wait"
26+
"k8s.io/klog"
2527
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
2628
gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute"
29+
testutils "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/test/e2e/utils"
2730
"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/test/remote"
2831

2932
csi "github.com/container-storage-interface/spec/lib/go/csi"
@@ -81,7 +84,68 @@ var _ = Describe("GCE PD CSI Driver", func() {
8184
// Attach Disk
8285
err := testAttachWriteReadDetach(volID, volName, instance, client, false /* readOnly */)
8386
Expect(err).To(BeNil(), "Failed to go through volume lifecycle")
87+
})
88+
89+
It("Should automatically fix symlink errors between /dev/sdx and /dev/by-id if disk is not found", func() {
90+
testContext := getRandomTestContext()
91+
92+
p, z, _ := testContext.Instance.GetIdentity()
93+
client := testContext.Client
94+
instance := testContext.Instance
95+
96+
// Set-up instance to have scsi_id where we expect it
97+
err := testutils.MkdirAll(instance, "/lib/udev_containerized")
98+
Expect(err).To(BeNil(), "could not make scsi_id containerized directory")
99+
err = testutils.CopyFile(instance, "/lib/udev/scsi_id", "/lib/udev_containerized/scsi_id")
100+
Expect(err).To(BeNil(), "could not copy scsi_id to containerized directory")
84101

102+
// Create Disk
103+
volName, volID := createAndValidateUniqueZonalDisk(client, p, z)
104+
105+
defer func() {
106+
// Delete Disk
107+
err := client.DeleteVolume(volID)
108+
Expect(err).To(BeNil(), "DeleteVolume failed")
109+
110+
// Validate Disk Deleted
111+
_, err = computeService.Disks.Get(p, z, volName).Do()
112+
Expect(gce.IsGCEError(err, "notFound")).To(BeTrue(), "Expected disk to not be found")
113+
}()
114+
115+
// Attach Disk
116+
err = client.ControllerPublishVolume(volID, instance.GetNodeID())
117+
Expect(err).To(BeNil(), "ControllerPublishVolume failed with error for disk %v on node %v: %v", volID, instance.GetNodeID())
118+
119+
defer func() {
120+
// Detach Disk
121+
err = client.ControllerUnpublishVolume(volID, instance.GetNodeID())
122+
if err != nil {
123+
klog.Errorf("Failed to detach disk: %v", err)
124+
}
125+
126+
}()
127+
128+
// MESS UP THE symlink
129+
err = testutils.RmAll(instance, "/dev/disk/by-id")
130+
Expect(err).To(BeNil(), "failed to remove /dev/by-id folder")
131+
132+
// Stage Disk
133+
stageDir := filepath.Join("/tmp/", volName, "stage")
134+
err = client.NodeStageExt4Volume(volID, stageDir)
135+
Expect(err).To(BeNil(), "failed to repair /dev/by-id symlink and stage volume")
136+
137+
defer func() {
138+
// Unstage Disk
139+
err = client.NodeUnstageVolume(volID, stageDir)
140+
if err != nil {
141+
klog.Errorf("Failed to unstage volume: %v", err)
142+
}
143+
fp := filepath.Join("/tmp/", volName)
144+
err = testutils.RmAll(instance, fp)
145+
if err != nil {
146+
klog.Errorf("Failed to rm file path %s: %v", fp, err)
147+
}
148+
}()
85149
})
86150

87151
It("Should create disks in correct zones when topology is specified", func() {

0 commit comments

Comments
 (0)