diff --git a/Dockerfile b/Dockerfile index 9a3ee846a..8f57331b3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,11 +20,15 @@ ADD . . ARG TAG RUN CGO_ENABLED=0 GOOS=linux go build -a -ldflags '-X main.vendorVersion='"${TAG:-latest}"' -extldflags "-static"' -o bin/gce-pd-csi-driver ./cmd/ +# MAD HACKS: Build a version first so we can take the scsi_id bin and put it somewhere else in our real build +FROM gcr.io/google-containers/debian-base-amd64:v2.0.0 as base +RUN clean-install udev + # Start from Google Debian base FROM gcr.io/google-containers/debian-base-amd64:v2.0.0 COPY --from=builder /go/src/sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/bin/gce-pd-csi-driver /gce-pd-csi-driver - # Install necessary dependencies RUN clean-install util-linux e2fsprogs mount ca-certificates udev xfsprogs +COPY --from=base /lib/udev/scsi_id /lib/udev_containerized/scsi_id ENTRYPOINT ["/gce-pd-csi-driver"] diff --git a/cmd/main.go b/cmd/main.go index c089c4788..fbd41e763 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -28,10 +28,6 @@ import ( mountmanager "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/mount-manager" ) -func init() { - flag.Set("logtostderr", "true") -} - var ( endpoint = flag.String("endpoint", "unix:/tmp/csi.sock", "CSI endpoint") gceConfigFilePath = flag.String("cloud-config", "", "Path to GCE cloud provider config") @@ -49,6 +45,7 @@ func init() { // Use V(5) for GCE Cloud Provider Call informational logging // Use V(6) for extra repeated/polling information klog.InitFlags(flag.CommandLine) + flag.Set("logtostderr", "true") } func main() { diff --git a/pkg/gce-pd-csi-driver/node.go b/pkg/gce-pd-csi-driver/node.go index e2e483117..45a564e3c 100644 --- a/pkg/gce-pd-csi-driver/node.go +++ b/pkg/gce-pd-csi-driver/node.go @@ -528,7 +528,7 @@ func (ns *GCENodeServer) getDevicePath(volumeID string, partition string) (strin } devicePaths := ns.DeviceUtils.GetDiskByIdPaths(deviceName, partition) - devicePath, err := ns.DeviceUtils.VerifyDevicePath(devicePaths) + devicePath, err := ns.DeviceUtils.VerifyDevicePath(devicePaths, volumeKey.Name) if err != nil { return "", fmt.Errorf("error verifying GCE PD (%q) is attached: %v", volumeKey.Name, err) diff --git a/pkg/mount-manager/device-utils.go b/pkg/mount-manager/device-utils.go index ec9c88091..be76f8947 100644 --- a/pkg/mount-manager/device-utils.go +++ b/pkg/mount-manager/device-utils.go @@ -20,10 +20,13 @@ import ( "os/exec" "path" "path/filepath" + "regexp" "strings" + "time" - "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog" + pathutils "k8s.io/utils/path" ) const ( @@ -46,6 +49,14 @@ const ( // 'fsck' found errors but exited without correcting them fsckErrorsUncorrected = 4 defaultMountCommand = "mount" + // scsi_id output should be in the form of: + // 0Google PersistentDisk + scsiPattern = `^0Google\s+PersistentDisk\s+([\S]+)\s*$` +) + +var ( + // regex to parse scsi_id output and extract the serial + scsiRegex = regexp.MustCompile(scsiPattern) ) // DeviceUtils are a collection of methods that act on the devices attached @@ -57,7 +68,7 @@ type DeviceUtils interface { // VerifyDevicePath returns the first of the list of device paths that // exists on the machine, or an empty string if none exists - VerifyDevicePath(devicePaths []string) (string, error) + VerifyDevicePath(devicePaths []string, diskName string) (string, error) } type deviceUtils struct { @@ -85,74 +96,163 @@ func (m *deviceUtils) GetDiskByIdPaths(deviceName string, partition string) []st return devicePaths } -// Returns the first path that exists, or empty string if none exist. -func (m *deviceUtils) VerifyDevicePath(devicePaths []string) (string, error) { - sdBefore, err := filepath.Glob(diskSDPattern) - if err != nil { - // Seeing this error means that the diskSDPattern is malformed. - klog.Errorf("Error filepath.Glob(\"%s\"): %v\r\n", diskSDPattern, err) +func existingDevicePath(devicePaths []string) (string, error) { + for _, devicePath := range devicePaths { + if pathExists, err := pathExists(devicePath); err != nil { + return "", fmt.Errorf("error checking if path exists: %v", err) + } else if pathExists { + return devicePath, nil + } } - sdBeforeSet := sets.NewString(sdBefore...) - // TODO(#69): Verify udevadm works as intended in driver - if err := udevadmChangeToNewDrives(sdBeforeSet); err != nil { - // udevadm errors should not block disk detachment, log and continue - klog.Errorf("udevadmChangeToNewDrives failed with: %v", err) + return "", nil +} + +// getScsiSerial assumes that /lib/udev/scsi_id exists and will error if it +// doesnt. It is the callers responsibility to verify the existence of this +// tool. Calls scsi_id on the given devicePath to get the serial number reported +// by that device. +func getScsiSerial(devicePath string) (string, error) { + out, err := exec.Command( + "/lib/udev_containerized/scsi_id", + "--page=0x83", + "--whitelisted", + fmt.Sprintf("--device=%v", devicePath)).CombinedOutput() + if err != nil { + return "", fmt.Errorf("scsi_id failed for device %q with output %s: %v", devicePath, string(out), err) } - for _, path := range devicePaths { - if pathExists, err := pathExists(path); err != nil { - return "", fmt.Errorf("Error checking if path exists: %v", err) - } else if pathExists { - return path, nil - } + return parseScsiSerial(string(out)) +} + +// Parse the output returned by scsi_id and extract the serial number +func parseScsiSerial(output string) (string, error) { + substrings := scsiRegex.FindStringSubmatch(output) + if substrings == nil { + return "", fmt.Errorf("scsi_id output cannot be parsed: %q", output) } - return "", nil + return substrings[1], nil } -// Triggers the application of udev rules by calling "udevadm trigger -// --action=change" for newly created "/dev/sd*" drives (exist only in -// after set). This is workaround for Issue #7972. Once the underlying -// issue has been resolved, this may be removed. -func udevadmChangeToNewDrives(sdBeforeSet sets.String) error { - sdAfter, err := filepath.Glob(diskSDPattern) +// VerifyDevicePath returns the first devicePath that maps to a real disk in the +// candidate devicePaths or an empty string if none is found. If +// /lib/udev_containerized/scsi_id exists it will attempt to fix any issues +// caused by missing paths or mismatched devices by running a udevadm --trigger. +func (m *deviceUtils) VerifyDevicePath(devicePaths []string, diskName string) (string, error) { + var devicePath string + var err error + const ( + pollInterval = 500 * time.Millisecond + pollTimeout = 3 * time.Second + ) + + scsiIDPath := "/lib/udev_containerized/scsi_id" + exists, err := pathutils.Exists(pathutils.CheckFollowSymlink, scsiIDPath) if err != nil { - return fmt.Errorf("Error filepath.Glob(\"%s\"): %v\r\n", diskSDPattern, err) + return "", fmt.Errorf("failed to check scsi_id existence: %v", err) + } + if !exists { + // No SCSI ID tool, the driver should be containerized with the tool so + // maybe something is wrong with the build process + return "", fmt.Errorf("could not find scsi_id tool at %s, unable to verify device paths", scsiIDPath) } - for _, sd := range sdAfter { - if !sdBeforeSet.Has(sd) { - return udevadmChangeToDrive(sd) + err = wait.Poll(pollInterval, pollTimeout, func() (bool, error) { + var innerErr error + + devicePath, innerErr = existingDevicePath(devicePaths) + if innerErr != nil { + return false, fmt.Errorf("failed to check for existing device path: %v", innerErr) + } + + if len(devicePath) == 0 { + // Couldn't find the path so we need to find a /dev/sdx with the SCSI + // serial that matches diskName. Then we run udevadm trigger on that + // device to get the device to show up in /dev/by-id/ + innerErr := udevadmTriggerForDiskIfExists(diskName) + if innerErr != nil { + return false, fmt.Errorf("failed to trigger udevadm fix for disk %s: %v", diskName, innerErr) + } + // Go to next retry loop to get the deviceName again after + // potentially fixing it with the udev command + return false, nil + } + + // If there exists a devicePath we make sure disk at /dev/sdx matches the + // expected disk at devicePath by matching SCSI Serial to the disk name + devSDX, innerErr := filepath.EvalSymlinks(devicePath) + if innerErr != nil { + return false, fmt.Errorf("filepath.EvalSymlinks(%q) failed with %v", devicePath, innerErr) + } + // Check to make sure device path maps to the correct disk + if strings.Contains(devSDX, diskSDPath) { + scsiSerial, innerErr := getScsiSerial(devSDX) + if innerErr != nil { + return false, fmt.Errorf("couldn't get SCSI serial number for disk %s: %v", diskName, innerErr) + } + // SUCCESS! devicePath points to a /dev/sdx that has a SCSI serial + // equivilant to our disk name + if scsiSerial == diskName { + return true, nil + } } + // The devicePath is not mapped to the correct disk + innerErr = udevadmTriggerForDiskIfExists(diskName) + if innerErr != nil { + return false, fmt.Errorf("failed to trigger udevadm fix for disk %s: %v", diskName, innerErr) + } + // Go to next retry loop to get the deviceName again after + // potentially fixing it with the udev command + return false, nil + }) + + if err != nil { + return "", fmt.Errorf("failed to find and re-link disk %s with udevadm after retrying for %v: %v", diskName, pollTimeout, err) } - return nil + return devicePath, nil } -// Calls "udevadm trigger --action=change" on the specified drive. -// drivePath must be the block device path to trigger on, in the format "/dev/sd*", or a symlink to it. -// This is workaround for Issue #7972. Once the underlying issue has been resolved, this may be removed. -func udevadmChangeToDrive(drivePath string) error { - klog.V(4).Infof("udevadmChangeToDrive: drive=%q", drivePath) - - // Evaluate symlink, if any - drive, err := filepath.EvalSymlinks(drivePath) +func udevadmTriggerForDiskIfExists(diskName string) error { + sds, err := filepath.Glob(diskSDPattern) if err != nil { - return fmt.Errorf("udevadmChangeToDrive: filepath.EvalSymlinks(%q) failed with %v.", drivePath, err) + return fmt.Errorf("failed to filepath.Glob(\"%s\"): %v", diskSDPattern, err) } - // Check to make sure input is "/dev/sd*" - if !strings.Contains(drive, diskSDPath) { - return fmt.Errorf("udevadmChangeToDrive: expected input in the form \"%s\" but drive is %q.", diskSDPattern, drive) + for _, devSDX := range sds { + scsiSerial, err := getScsiSerial(devSDX) + if err != nil { + return fmt.Errorf("failed to get SCSI Serial num for %s: %v", devSDX, err) + } + if scsiSerial == diskName { + // Found the disk that we're looking for so run a trigger on it + // to resolve its /dev/by-id/ path + klog.Warningf("udevadm --trigger running to fix disk at path %s which has SCSI ID %s", devSDX, scsiSerial) + err := udevadmChangeToDrive(devSDX) + if err != nil { + return fmt.Errorf("failed to fix disk at path %s which has SCSI ID %s: %v", devSDX, scsiSerial, err) + } + return nil + } } + return fmt.Errorf("udevadm --trigger requested to fix disk %s but no such disk was found in %v", diskName, sds) +} +// Calls "udevadm trigger --action=change" on the specified drive. drivePath +// must be the block device path to trigger on, in the format "/dev/sd*", or a +// symlink to it. This is workaround for Issue #7972. Once the underlying issue +// has been resolved, this may be removed. +// udevadm takes a little bit to work its magic in the background so any callers +// should not expect the trigger to complete instantly and may need to poll for +// the change +func udevadmChangeToDrive(devSDX string) error { // Call "udevadm trigger --action=change --property-match=DEVNAME=/dev/sd..." - _, err = exec.Command( + out, err := exec.Command( "udevadm", "trigger", "--action=change", - fmt.Sprintf("--property-match=DEVNAME=%s", drive)).CombinedOutput() + fmt.Sprintf("--property-match=DEVNAME=%s", devSDX)).CombinedOutput() if err != nil { - return fmt.Errorf("udevadmChangeToDrive: udevadm trigger failed for drive %q with %v.", drive, err) + return fmt.Errorf("udevadmChangeToDrive: udevadm trigger failed for drive %q with output %s: %v.", devSDX, string(out), err) } return nil } diff --git a/pkg/mount-manager/fake-device-utils.go b/pkg/mount-manager/fake-device-utils.go index 030e11ec3..2c58e9ff1 100644 --- a/pkg/mount-manager/fake-device-utils.go +++ b/pkg/mount-manager/fake-device-utils.go @@ -30,7 +30,7 @@ func (m *fakeDeviceUtils) GetDiskByIdPaths(pdName string, partition string) []st } // Returns the first path that exists, or empty string if none exist. -func (m *fakeDeviceUtils) VerifyDevicePath(devicePaths []string) (string, error) { +func (m *fakeDeviceUtils) VerifyDevicePath(devicePaths []string, diskName string) (string, error) { // Return any random device path to use as mount source return "/dev/disk/fake-path", nil } diff --git a/test/e2e/tests/setup_e2e_test.go b/test/e2e/tests/setup_e2e_test.go index 9c37286f0..1b067939a 100644 --- a/test/e2e/tests/setup_e2e_test.go +++ b/test/e2e/tests/setup_e2e_test.go @@ -89,6 +89,16 @@ var _ = BeforeSuite(func() { klog.Fatalf("Failed to setup instance %v: %v", nodeID, err) } + err = testutils.MkdirAll(i, "/lib/udev_containerized") + if err != nil { + klog.Fatalf("Could not make scsi_id containerized directory: %v", err) + } + + err = testutils.CopyFile(i, "/lib/udev/scsi_id", "/lib/udev_containerized/scsi_id") + if err != nil { + klog.Fatalf("could not copy scsi_id to containerized directory: %v", err) + } + klog.Infof("Creating new driver and client for node %s\n", i.GetName()) // Create new driver and client testContext, err := testutils.GCEClientAndDriverSetup(i) diff --git a/test/e2e/tests/single_zone_e2e_test.go b/test/e2e/tests/single_zone_e2e_test.go index b38846a78..6035fda2d 100644 --- a/test/e2e/tests/single_zone_e2e_test.go +++ b/test/e2e/tests/single_zone_e2e_test.go @@ -17,13 +17,17 @@ package tests import ( "context" "fmt" + "path/filepath" "strings" "time" "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/klog" "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common" gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute" + mountmanager "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/mount-manager" + testutils "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/test/e2e/utils" "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/test/remote" csi "github.com/container-storage-interface/spec/lib/go/csi" @@ -81,7 +85,78 @@ var _ = Describe("GCE PD CSI Driver", func() { // Attach Disk err := testAttachWriteReadDetach(volID, volName, instance, client, false /* readOnly */) Expect(err).To(BeNil(), "Failed to go through volume lifecycle") + }) + + It("Should automatically fix symlink errors between /dev/sdx and /dev/by-id if disk is not found", func() { + testContext := getRandomTestContext() + + p, z, _ := testContext.Instance.GetIdentity() + client := testContext.Client + instance := testContext.Instance + + // Set-up instance to have scsi_id where we expect it + + // Create Disk + volName, volID := createAndValidateUniqueZonalDisk(client, p, z) + + defer func() { + // Delete Disk + err := client.DeleteVolume(volID) + Expect(err).To(BeNil(), "DeleteVolume failed") + + // Validate Disk Deleted + _, err = computeService.Disks.Get(p, z, volName).Do() + Expect(gce.IsGCEError(err, "notFound")).To(BeTrue(), "Expected disk to not be found") + }() + // Attach Disk + err := client.ControllerPublishVolume(volID, instance.GetNodeID()) + Expect(err).To(BeNil(), "ControllerPublishVolume failed with error for disk %v on node %v: %v", volID, instance.GetNodeID()) + + defer func() { + // Detach Disk + err = client.ControllerUnpublishVolume(volID, instance.GetNodeID()) + if err != nil { + klog.Errorf("Failed to detach disk: %v", err) + } + + }() + + // MESS UP THE symlink + devicePaths := mountmanager.NewDeviceUtils().GetDiskByIdPaths(volName, "") + for _, devicePath := range devicePaths { + err = testutils.RmAll(instance, devicePath) + Expect(err).To(BeNil(), "failed to remove /dev/by-id folder") + } + + // Stage Disk + stageDir := filepath.Join("/tmp/", volName, "stage") + err = client.NodeStageExt4Volume(volID, stageDir) + Expect(err).To(BeNil(), "failed to repair /dev/by-id symlink and stage volume") + + // Validate that the link is correct + var validated bool + for _, devicePath := range devicePaths { + validated, err = testutils.ValidateLogicalLinkIsDisk(instance, devicePath, volName) + Expect(err).To(BeNil(), "failed to validate link %s is disk %s: %v", stageDir, volName, err) + if validated { + break + } + } + Expect(validated).To(BeTrue(), "could not find device in %v that links to volume %s", devicePaths, volName) + + defer func() { + // Unstage Disk + err = client.NodeUnstageVolume(volID, stageDir) + if err != nil { + klog.Errorf("Failed to unstage volume: %v", err) + } + fp := filepath.Join("/tmp/", volName) + err = testutils.RmAll(instance, fp) + if err != nil { + klog.Errorf("Failed to rm file path %s: %v", fp, err) + } + }() }) It("Should create disks in correct zones when topology is specified", func() { diff --git a/test/e2e/utils/utils.go b/test/e2e/utils/utils.go index c7300f1ac..4c311ad47 100644 --- a/test/e2e/utils/utils.go +++ b/test/e2e/utils/utils.go @@ -20,6 +20,7 @@ import ( "math/rand" "os" "path" + "regexp" "strconv" "strings" "time" @@ -49,8 +50,8 @@ func GCEClientAndDriverSetup(instance *remote.InstanceInfo) (*remote.TestContext endpoint := fmt.Sprintf("tcp://localhost:%s", port) workspace := remote.NewWorkspaceDir("gce-pd-e2e-") - driverRunCmd := fmt.Sprintf("sh -c '/usr/bin/nohup %s/gce-pd-csi-driver --endpoint=%s> %s/prog.out 2> %s/prog.err < /dev/null &'", - workspace, endpoint, workspace, workspace) + driverRunCmd := fmt.Sprintf("sh -c '/usr/bin/nohup %s/gce-pd-csi-driver -v=4 --endpoint=%s 2> %s/prog.out < /dev/null > /dev/null &'", + workspace, endpoint, workspace) config := &remote.ClientConfig{ PkgPath: pkgPath, @@ -195,6 +196,14 @@ func GetBlockSizeInGb(instance *remote.InstanceInfo, devicePath string) (int64, return utilcommon.BytesToGb(n), nil } +func Symlink(instance *remote.InstanceInfo, src, dest string) error { + output, err := instance.SSH("ln", "-s", src, dest) + if err != nil { + return fmt.Errorf("failed to symlink from %s to %s. Output: %v, errror: %v", src, dest, output, err) + } + return nil +} + func RmAll(instance *remote.InstanceInfo, filePath string) error { output, err := instance.SSH("rm", "-rf", filePath) if err != nil { @@ -202,3 +211,54 @@ func RmAll(instance *remote.InstanceInfo, filePath string) error { } return nil } + +func MkdirAll(instance *remote.InstanceInfo, dir string) error { + output, err := instance.SSH("mkdir", "-p", dir) + if err != nil { + return fmt.Errorf("failed to mkdir -p %s. Output: %v, errror: %v", dir, output, err) + } + return nil +} + +func CopyFile(instance *remote.InstanceInfo, src, dest string) error { + output, err := instance.SSH("cp", src, dest) + if err != nil { + return fmt.Errorf("failed to copy %s to %s. Output: %v, errror: %v", src, dest, output, err) + } + return nil +} + +// ValidateLogicalLinkIsDisk takes a symlink location at "link" and finds the +// link location - it then finds the backing PD using scsi_id and validates that +// it is the same as diskName +func ValidateLogicalLinkIsDisk(instance *remote.InstanceInfo, link, diskName string) (bool, error) { + const ( + scsiPattern = `^0Google\s+PersistentDisk\s+([\S]+)\s*$` + sdPattern = "sd\\w" + ) + // regex to parse scsi_id output and extract the serial + scsiRegex := regexp.MustCompile(scsiPattern) + sdRegex := regexp.MustCompile(sdPattern) + + devSDX, err := instance.SSH("find", link, "-printf", "'%l'") + if err != nil { + return false, fmt.Errorf("failed to find %s's symbolic link. Output: %v, errror: %v", link, devSDX, err) + } + if len(devSDX) == 0 { + return false, nil + } + sdx := sdRegex.Find([]byte(devSDX)) + fullDevPath := path.Join("/dev/", string(sdx)) + scsiIDOut, err := instance.SSH("/lib/udev_containerized/scsi_id", "--page=0x83", "--whitelisted", fmt.Sprintf("--device=%v", fullDevPath)) + if err != nil { + return false, fmt.Errorf("failed to find %s's SCSI ID. Output: %v, errror: %v", devSDX, scsiIDOut, err) + } + scsiID := scsiRegex.FindStringSubmatch(scsiIDOut) + if len(scsiID) == 0 { + return false, fmt.Errorf("scsi_id output cannot be parsed: %s", scsiID) + } + if scsiID[1] != diskName { + return false, fmt.Errorf("scsiID %s did not match expected diskName %s", scsiID, diskName) + } + return true, nil +}