Skip to content

Update VerifyDevicePath logic to run udevadm --trigger to fix scenarios where disk is not found or is wrong device #459

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,15 @@ ADD . .
ARG TAG
RUN CGO_ENABLED=0 GOOS=linux go build -a -ldflags '-X main.vendorVersion='"${TAG:-latest}"' -extldflags "-static"' -o bin/gce-pd-csi-driver ./cmd/

# MAD HACKS: Build a version first so we can take the scsi_id bin and put it somewhere else in our real build
FROM gcr.io/google-containers/debian-base-amd64:v2.0.0 as base
RUN clean-install udev

# Start from Google Debian base
FROM gcr.io/google-containers/debian-base-amd64:v2.0.0
COPY --from=builder /go/src/sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/bin/gce-pd-csi-driver /gce-pd-csi-driver

# Install necessary dependencies
RUN clean-install util-linux e2fsprogs mount ca-certificates udev xfsprogs
COPY --from=base /lib/udev/scsi_id /lib/udev_containerized/scsi_id

ENTRYPOINT ["/gce-pd-csi-driver"]
5 changes: 1 addition & 4 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,6 @@ import (
mountmanager "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/mount-manager"
)

func init() {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we had two init functions before, I don't know if that was causing issues but it doesn't hurt to fix

flag.Set("logtostderr", "true")
}

var (
endpoint = flag.String("endpoint", "unix:/tmp/csi.sock", "CSI endpoint")
gceConfigFilePath = flag.String("cloud-config", "", "Path to GCE cloud provider config")
Expand All @@ -49,6 +45,7 @@ func init() {
// Use V(5) for GCE Cloud Provider Call informational logging
// Use V(6) for extra repeated/polling information
klog.InitFlags(flag.CommandLine)
flag.Set("logtostderr", "true")
}

func main() {
Expand Down
2 changes: 1 addition & 1 deletion pkg/gce-pd-csi-driver/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ func (ns *GCENodeServer) getDevicePath(volumeID string, partition string) (strin
}

devicePaths := ns.DeviceUtils.GetDiskByIdPaths(deviceName, partition)
devicePath, err := ns.DeviceUtils.VerifyDevicePath(devicePaths)
devicePath, err := ns.DeviceUtils.VerifyDevicePath(devicePaths, volumeKey.Name)

if err != nil {
return "", fmt.Errorf("error verifying GCE PD (%q) is attached: %v", volumeKey.Name, err)
Expand Down
192 changes: 146 additions & 46 deletions pkg/mount-manager/device-utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,13 @@ import (
"os/exec"
"path"
"path/filepath"
"regexp"
"strings"
"time"

"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog"
pathutils "k8s.io/utils/path"
)

const (
Expand All @@ -46,6 +49,14 @@ const (
// 'fsck' found errors but exited without correcting them
fsckErrorsUncorrected = 4
defaultMountCommand = "mount"
// scsi_id output should be in the form of:
// 0Google PersistentDisk <disk name>
scsiPattern = `^0Google\s+PersistentDisk\s+([\S]+)\s*$`
)

var (
// regex to parse scsi_id output and extract the serial
scsiRegex = regexp.MustCompile(scsiPattern)
)

// DeviceUtils are a collection of methods that act on the devices attached
Expand All @@ -57,7 +68,7 @@ type DeviceUtils interface {

// VerifyDevicePath returns the first of the list of device paths that
// exists on the machine, or an empty string if none exists
VerifyDevicePath(devicePaths []string) (string, error)
VerifyDevicePath(devicePaths []string, diskName string) (string, error)
}

type deviceUtils struct {
Expand Down Expand Up @@ -85,74 +96,163 @@ func (m *deviceUtils) GetDiskByIdPaths(deviceName string, partition string) []st
return devicePaths
}

// Returns the first path that exists, or empty string if none exist.
func (m *deviceUtils) VerifyDevicePath(devicePaths []string) (string, error) {
sdBefore, err := filepath.Glob(diskSDPattern)
if err != nil {
// Seeing this error means that the diskSDPattern is malformed.
klog.Errorf("Error filepath.Glob(\"%s\"): %v\r\n", diskSDPattern, err)
func existingDevicePath(devicePaths []string) (string, error) {
for _, devicePath := range devicePaths {
if pathExists, err := pathExists(devicePath); err != nil {
return "", fmt.Errorf("error checking if path exists: %v", err)
} else if pathExists {
return devicePath, nil
}
}
sdBeforeSet := sets.NewString(sdBefore...)
// TODO(#69): Verify udevadm works as intended in driver
if err := udevadmChangeToNewDrives(sdBeforeSet); err != nil {
// udevadm errors should not block disk detachment, log and continue
klog.Errorf("udevadmChangeToNewDrives failed with: %v", err)
return "", nil
}

// getScsiSerial assumes that /lib/udev/scsi_id exists and will error if it
// doesnt. It is the callers responsibility to verify the existence of this
// tool. Calls scsi_id on the given devicePath to get the serial number reported
// by that device.
func getScsiSerial(devicePath string) (string, error) {
out, err := exec.Command(
"/lib/udev_containerized/scsi_id",
"--page=0x83",
"--whitelisted",
fmt.Sprintf("--device=%v", devicePath)).CombinedOutput()
if err != nil {
return "", fmt.Errorf("scsi_id failed for device %q with output %s: %v", devicePath, string(out), err)
}

for _, path := range devicePaths {
if pathExists, err := pathExists(path); err != nil {
return "", fmt.Errorf("Error checking if path exists: %v", err)
} else if pathExists {
return path, nil
}
return parseScsiSerial(string(out))
}

// Parse the output returned by scsi_id and extract the serial number
func parseScsiSerial(output string) (string, error) {
substrings := scsiRegex.FindStringSubmatch(output)
if substrings == nil {
return "", fmt.Errorf("scsi_id output cannot be parsed: %q", output)
}

return "", nil
return substrings[1], nil
}

// Triggers the application of udev rules by calling "udevadm trigger
// --action=change" for newly created "/dev/sd*" drives (exist only in
// after set). This is workaround for Issue #7972. Once the underlying
// issue has been resolved, this may be removed.
func udevadmChangeToNewDrives(sdBeforeSet sets.String) error {
sdAfter, err := filepath.Glob(diskSDPattern)
// VerifyDevicePath returns the first devicePath that maps to a real disk in the
// candidate devicePaths or an empty string if none is found. If
// /lib/udev_containerized/scsi_id exists it will attempt to fix any issues
// caused by missing paths or mismatched devices by running a udevadm --trigger.
func (m *deviceUtils) VerifyDevicePath(devicePaths []string, diskName string) (string, error) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need devicePaths as an argument, or can we generate it inside this function?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess we don't really have to, but then we would need to add partition as an argument. (I can follow up but probably out of scope for this PR)

var devicePath string
var err error
const (
pollInterval = 500 * time.Millisecond
pollTimeout = 3 * time.Second
)

scsiIDPath := "/lib/udev_containerized/scsi_id"
exists, err := pathutils.Exists(pathutils.CheckFollowSymlink, scsiIDPath)
if err != nil {
return fmt.Errorf("Error filepath.Glob(\"%s\"): %v\r\n", diskSDPattern, err)
return "", fmt.Errorf("failed to check scsi_id existence: %v", err)
}
if !exists {
// No SCSI ID tool, the driver should be containerized with the tool so
// maybe something is wrong with the build process
return "", fmt.Errorf("could not find scsi_id tool at %s, unable to verify device paths", scsiIDPath)
}

for _, sd := range sdAfter {
if !sdBeforeSet.Has(sd) {
return udevadmChangeToDrive(sd)
err = wait.Poll(pollInterval, pollTimeout, func() (bool, error) {
var innerErr error

devicePath, innerErr = existingDevicePath(devicePaths)
if innerErr != nil {
return false, fmt.Errorf("failed to check for existing device path: %v", innerErr)
}

if len(devicePath) == 0 {
// Couldn't find the path so we need to find a /dev/sdx with the SCSI
// serial that matches diskName. Then we run udevadm trigger on that
// device to get the device to show up in /dev/by-id/
innerErr := udevadmTriggerForDiskIfExists(diskName)
if innerErr != nil {
return false, fmt.Errorf("failed to trigger udevadm fix for disk %s: %v", diskName, innerErr)
}
// Go to next retry loop to get the deviceName again after
// potentially fixing it with the udev command
return false, nil
}

// If there exists a devicePath we make sure disk at /dev/sdx matches the
// expected disk at devicePath by matching SCSI Serial to the disk name
devSDX, innerErr := filepath.EvalSymlinks(devicePath)
if innerErr != nil {
return false, fmt.Errorf("filepath.EvalSymlinks(%q) failed with %v", devicePath, innerErr)
}
// Check to make sure device path maps to the correct disk
if strings.Contains(devSDX, diskSDPath) {
scsiSerial, innerErr := getScsiSerial(devSDX)
if innerErr != nil {
return false, fmt.Errorf("couldn't get SCSI serial number for disk %s: %v", diskName, innerErr)
}
// SUCCESS! devicePath points to a /dev/sdx that has a SCSI serial
// equivilant to our disk name
if scsiSerial == diskName {
return true, nil
}
}
// The devicePath is not mapped to the correct disk
innerErr = udevadmTriggerForDiskIfExists(diskName)
if innerErr != nil {
return false, fmt.Errorf("failed to trigger udevadm fix for disk %s: %v", diskName, innerErr)
}
// Go to next retry loop to get the deviceName again after
// potentially fixing it with the udev command
return false, nil
})

if err != nil {
return "", fmt.Errorf("failed to find and re-link disk %s with udevadm after retrying for %v: %v", diskName, pollTimeout, err)
}

return nil
return devicePath, nil
}

// Calls "udevadm trigger --action=change" on the specified drive.
// drivePath must be the block device path to trigger on, in the format "/dev/sd*", or a symlink to it.
// This is workaround for Issue #7972. Once the underlying issue has been resolved, this may be removed.
func udevadmChangeToDrive(drivePath string) error {
klog.V(4).Infof("udevadmChangeToDrive: drive=%q", drivePath)

// Evaluate symlink, if any
drive, err := filepath.EvalSymlinks(drivePath)
func udevadmTriggerForDiskIfExists(diskName string) error {
sds, err := filepath.Glob(diskSDPattern)
if err != nil {
return fmt.Errorf("udevadmChangeToDrive: filepath.EvalSymlinks(%q) failed with %v.", drivePath, err)
return fmt.Errorf("failed to filepath.Glob(\"%s\"): %v", diskSDPattern, err)
}
// Check to make sure input is "/dev/sd*"
if !strings.Contains(drive, diskSDPath) {
return fmt.Errorf("udevadmChangeToDrive: expected input in the form \"%s\" but drive is %q.", diskSDPattern, drive)
for _, devSDX := range sds {
scsiSerial, err := getScsiSerial(devSDX)
if err != nil {
return fmt.Errorf("failed to get SCSI Serial num for %s: %v", devSDX, err)
}
if scsiSerial == diskName {
// Found the disk that we're looking for so run a trigger on it
// to resolve its /dev/by-id/ path
klog.Warningf("udevadm --trigger running to fix disk at path %s which has SCSI ID %s", devSDX, scsiSerial)
err := udevadmChangeToDrive(devSDX)
if err != nil {
return fmt.Errorf("failed to fix disk at path %s which has SCSI ID %s: %v", devSDX, scsiSerial, err)
}
return nil
}
}
return fmt.Errorf("udevadm --trigger requested to fix disk %s but no such disk was found in %v", diskName, sds)
}

// Calls "udevadm trigger --action=change" on the specified drive. drivePath
// must be the block device path to trigger on, in the format "/dev/sd*", or a
// symlink to it. This is workaround for Issue #7972. Once the underlying issue
// has been resolved, this may be removed.
// udevadm takes a little bit to work its magic in the background so any callers
// should not expect the trigger to complete instantly and may need to poll for
// the change
func udevadmChangeToDrive(devSDX string) error {
// Call "udevadm trigger --action=change --property-match=DEVNAME=/dev/sd..."
_, err = exec.Command(
out, err := exec.Command(
"udevadm",
"trigger",
"--action=change",
fmt.Sprintf("--property-match=DEVNAME=%s", drive)).CombinedOutput()
fmt.Sprintf("--property-match=DEVNAME=%s", devSDX)).CombinedOutput()
if err != nil {
return fmt.Errorf("udevadmChangeToDrive: udevadm trigger failed for drive %q with %v.", drive, err)
return fmt.Errorf("udevadmChangeToDrive: udevadm trigger failed for drive %q with output %s: %v.", devSDX, string(out), err)
}
return nil
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/mount-manager/fake-device-utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func (m *fakeDeviceUtils) GetDiskByIdPaths(pdName string, partition string) []st
}

// Returns the first path that exists, or empty string if none exist.
func (m *fakeDeviceUtils) VerifyDevicePath(devicePaths []string) (string, error) {
func (m *fakeDeviceUtils) VerifyDevicePath(devicePaths []string, diskName string) (string, error) {
// Return any random device path to use as mount source
return "/dev/disk/fake-path", nil
}
10 changes: 10 additions & 0 deletions test/e2e/tests/setup_e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,16 @@ var _ = BeforeSuite(func() {
klog.Fatalf("Failed to setup instance %v: %v", nodeID, err)
}

err = testutils.MkdirAll(i, "/lib/udev_containerized")
if err != nil {
klog.Fatalf("Could not make scsi_id containerized directory: %v", err)
}

err = testutils.CopyFile(i, "/lib/udev/scsi_id", "/lib/udev_containerized/scsi_id")
if err != nil {
klog.Fatalf("could not copy scsi_id to containerized directory: %v", err)
}

klog.Infof("Creating new driver and client for node %s\n", i.GetName())
// Create new driver and client
testContext, err := testutils.GCEClientAndDriverSetup(i)
Expand Down
Loading