Skip to content

Commit a09c7c2

Browse files
authored
fix(xunix): improve handling of gpu library mounts (#129)
* Mounts symlinks to auto-added GPU libs to inner container
* Tightens up gpuExtraRegex to avoid extraneous matches (e.g. libglib.so)
* Adds GPU integration tests to include regression test for redhat
* Adds GPU integration test with sample NVIDIA CUDA image
1 parent a9acf2c commit a09c7c2

File tree

4 files changed

+333
-17
lines changed

4 files changed

+333
-17
lines changed

integration/gpu_test.go

+111-8
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"testing"
1010
"time"
1111

12+
"github.com/stretchr/testify/assert"
1213
"github.com/stretchr/testify/require"
1314

1415
"github.com/coder/envbox/integration/integrationtest"
@@ -41,8 +42,7 @@ func TestDocker_Nvidia(t *testing.T) {
4142
)
4243

4344
// Assert that we can run nvidia-smi in the inner container.
44-
_, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "nvidia-smi")
45-
require.NoError(t, err, "failed to run nvidia-smi in the inner container")
45+
assertInnerNvidiaSMI(ctx, t, ctID)
4646
})
4747

4848
t.Run("Redhat", func(t *testing.T) {
@@ -52,16 +52,29 @@ func TestDocker_Nvidia(t *testing.T) {
5252

5353
// Start the envbox container.
5454
ctID := startEnvboxCmd(ctx, t, integrationtest.RedhatImage, "root",
55-
"-v", "/usr/lib/x86_64-linux-gnu:/var/coder/usr/lib64",
55+
"-v", "/usr/lib/x86_64-linux-gnu:/var/coder/usr/lib",
5656
"--env", "CODER_ADD_GPU=true",
57-
"--env", "CODER_USR_LIB_DIR=/var/coder/usr/lib64",
57+
"--env", "CODER_USR_LIB_DIR=/var/coder/usr/lib",
5858
"--runtime=nvidia",
5959
"--gpus=all",
6060
)
6161

6262
// Assert that we can run nvidia-smi in the inner container.
63-
_, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "nvidia-smi")
64-
require.NoError(t, err, "failed to run nvidia-smi in the inner container")
63+
assertInnerNvidiaSMI(ctx, t, ctID)
64+
65+
// Make sure dnf still works. This checks for a regression due to
66+
// gpuExtraRegex matching `libglib.so` in the outer container.
67+
// This had a dependency on `libpcre.so.3` which would cause dnf to fail.
68+
out, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "dnf")
69+
if !assert.NoError(t, err, "failed to run dnf in the inner container") {
70+
t.Logf("dnf output:\n%s", strings.TrimSpace(out))
71+
}
72+
73+
// Make sure libglib.so is not present in the inner container.
74+
out, err = execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "ls", "-1", "/usr/lib/x86_64-linux-gnu/libglib*")
75+
// An error is expected here.
76+
assert.Error(t, err, "libglib should not be present in the inner container")
77+
assert.Contains(t, out, "No such file or directory", "libglib should not be present in the inner container")
6578
})
6679

6780
t.Run("InnerUsrLibDirOverride", func(t *testing.T) {
@@ -79,11 +92,58 @@ func TestDocker_Nvidia(t *testing.T) {
7992
"--gpus=all",
8093
)
8194

82-
// Assert that the libraries end up in the expected location in the inner container.
83-
out, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "ls", "-l", "/usr/lib/coder")
95+
// Assert that the libraries end up in the expected location in the inner
96+
// container.
97+
out, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "ls", "-1", "/usr/lib/coder")
8498
require.NoError(t, err, "inner usr lib dir override failed")
8599
require.Regexp(t, `(?i)(libgl|nvidia|vulkan|cuda)`, out)
86100
})
101+
102+
t.Run("EmptyHostUsrLibDir", func(t *testing.T) {
103+
t.Parallel()
104+
ctx, cancel := context.WithCancel(context.Background())
105+
t.Cleanup(cancel)
106+
emptyUsrLibDir := t.TempDir()
107+
108+
// Start the envbox container.
109+
ctID := startEnvboxCmd(ctx, t, integrationtest.UbuntuImage, "root",
110+
"-v", emptyUsrLibDir+":/var/coder/usr/lib",
111+
"--env", "CODER_ADD_GPU=true",
112+
"--env", "CODER_USR_LIB_DIR=/var/coder/usr/lib",
113+
"--runtime=nvidia",
114+
"--gpus=all",
115+
)
116+
117+
ofs := outerFiles(ctx, t, ctID, "/usr/lib/x86_64-linux-gnu/libnv*")
118+
// Assert invariant: the outer container has the files we expect.
119+
require.NotEmpty(t, ofs, "failed to list outer container files")
120+
// Assert that expected files are available in the inner container.
121+
assertInnerFiles(ctx, t, ctID, "/usr/lib/x86_64-linux-gnu/libnv*", ofs...)
122+
assertInnerNvidiaSMI(ctx, t, ctID)
123+
})
124+
125+
t.Run("CUDASample", func(t *testing.T) {
126+
t.Parallel()
127+
128+
ctx, cancel := context.WithCancel(context.Background())
129+
t.Cleanup(cancel)
130+
131+
// Start the envbox container.
132+
ctID := startEnvboxCmd(ctx, t, integrationtest.CUDASampleImage, "root",
133+
"-v", "/usr/lib/x86_64-linux-gnu:/var/coder/usr/lib",
134+
"--env", "CODER_ADD_GPU=true",
135+
"--env", "CODER_USR_LIB_DIR=/var/coder/usr/lib",
136+
"--runtime=nvidia",
137+
"--gpus=all",
138+
)
139+
140+
// Assert that we can run nvidia-smi in the inner container.
141+
assertInnerNvidiaSMI(ctx, t, ctID)
142+
143+
// Assert that /tmp/vectorAdd runs successfully in the inner container.
144+
_, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "/tmp/vectorAdd")
145+
require.NoError(t, err, "failed to run /tmp/vectorAdd in the inner container")
146+
})
87147
}
88148

89149
// dockerRuntimes returns the list of container runtimes available on the host.
@@ -101,6 +161,49 @@ func dockerRuntimes(t *testing.T) []string {
101161
return strings.Split(raw, "\n")
102162
}
103163

164+
// outerFiles returns the list of files in the outer container matching the
165+
// given pattern. It does this by running `ls -1` in the outer container.
166+
func outerFiles(ctx context.Context, t *testing.T, containerID, pattern string) []string {
167+
t.Helper()
168+
// We need to use /bin/sh -c to avoid the shell interpreting the glob.
169+
out, err := execContainerCmd(ctx, t, containerID, "/bin/sh", "-c", "ls -1 "+pattern)
170+
require.NoError(t, err, "failed to list outer container files")
171+
files := strings.Split(strings.TrimSpace(out), "\n")
172+
slices.Sort(files)
173+
return files
174+
}
175+
176+
// assertInnerFiles checks that all the files matching the given pattern exist in the
177+
// inner container.
178+
func assertInnerFiles(ctx context.Context, t *testing.T, containerID, pattern string, expected ...string) {
179+
t.Helper()
180+
181+
// Get the list of files in the inner container.
182+
// We need to use /bin/sh -c to avoid the shell interpreting the glob.
183+
out, err := execContainerCmd(ctx, t, containerID, "docker", "exec", "workspace_cvm", "/bin/sh", "-c", "ls -1 "+pattern)
184+
require.NoError(t, err, "failed to list inner container files")
185+
innerFiles := strings.Split(strings.TrimSpace(out), "\n")
186+
187+
// Check that the expected files exist in the inner container.
188+
missingFiles := make([]string, 0)
189+
for _, expectedFile := range expected {
190+
if !slices.Contains(innerFiles, expectedFile) {
191+
missingFiles = append(missingFiles, expectedFile)
192+
}
193+
}
194+
require.Empty(t, missingFiles, "missing files in inner container: %s", strings.Join(missingFiles, ", "))
195+
}
196+
197+
// assertInnerNvidiaSMI checks that nvidia-smi runs successfully in the inner
198+
// container.
199+
func assertInnerNvidiaSMI(ctx context.Context, t *testing.T, containerID string) {
200+
t.Helper()
201+
// Assert that we can run nvidia-smi in the inner container.
202+
out, err := execContainerCmd(ctx, t, containerID, "docker", "exec", "workspace_cvm", "nvidia-smi")
203+
require.NoError(t, err, "failed to run nvidia-smi in the inner container")
204+
require.Contains(t, out, "NVIDIA-SMI", "nvidia-smi output does not contain NVIDIA-SMI")
205+
}
206+
104207
// startEnvboxCmd starts the envbox container with the given arguments.
105208
// Ideally we would use ory/dockertest for this, but it doesn't support
106209
// specifying the runtime. We have alternatively used the docker client library,

integration/integrationtest/docker.go

+3
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ const (
4242
UbuntuImage = "gcr.io/coder-dev-1/sreya/ubuntu-coder"
4343
// Redhat UBI9 image as of 2025-03-05
4444
RedhatImage = "registry.access.redhat.com/ubi9/ubi:9.5"
45+
// CUDASampleImage is a CUDA sample image from NVIDIA's container registry.
46+
// It contains a binary /tmp/vectorAdd which can be run to test the CUDA setup.
47+
CUDASampleImage = "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2"
4548

4649
// RegistryImage is used to assert that we add certs
4750
// correctly to the docker daemon when pulling an image

xunix/gpu.go

+95-4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"os"
77
"path/filepath"
88
"regexp"
9+
"slices"
910
"sort"
1011
"strings"
1112

@@ -17,9 +18,9 @@ import (
1718
)
1819

1920
var (
20-
gpuMountRegex = regexp.MustCompile("(?i)(nvidia|vulkan|cuda)")
21-
gpuExtraRegex = regexp.MustCompile("(?i)(libgl|nvidia|vulkan|cuda)")
22-
gpuEnvRegex = regexp.MustCompile("(?i)nvidia")
21+
gpuMountRegex = regexp.MustCompile(`(?i)(nvidia|vulkan|cuda)`)
22+
gpuExtraRegex = regexp.MustCompile(`(?i)(libgl(e|sx|\.)|nvidia|vulkan|cuda)`)
23+
gpuEnvRegex = regexp.MustCompile(`(?i)nvidia`)
2324
sharedObjectRegex = regexp.MustCompile(`\.so(\.[0-9\.]+)?$`)
2425
)
2526

@@ -39,6 +40,7 @@ func GPUEnvs(ctx context.Context) []string {
3940

4041
func GPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]Device, []mount.MountPoint, error) {
4142
var (
43+
afs = GetFS(ctx)
4244
mounter = Mounter(ctx)
4345
devices = []Device{}
4446
binds = []mount.MountPoint{}
@@ -64,6 +66,22 @@ func GPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]Device, []m
6466

6567
// If it's not in /dev treat it as a bind mount.
6668
binds = append(binds, m)
69+
// We also want to find any symlinks that point to the target.
70+
// This is important for the nvidia driver as it mounts the driver
71+
// files with the driver version appended to the end, and creates
72+
// symlinks that point to the actual files.
73+
links, err := SameDirSymlinks(afs, m.Path)
74+
if err != nil {
75+
log.Error(ctx, "find symlinks", slog.F("path", m.Path), slog.Error(err))
76+
} else {
77+
for _, link := range links {
78+
log.Debug(ctx, "found symlink", slog.F("link", link), slog.F("target", m.Path))
79+
binds = append(binds, mount.MountPoint{
80+
Path: link,
81+
Opts: []string{"ro"},
82+
})
83+
}
84+
}
6785
}
6886
}
6987

@@ -104,7 +122,11 @@ func usrLibGPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]mount
104122
return nil
105123
}
106124

107-
if !sharedObjectRegex.MatchString(path) || !gpuExtraRegex.MatchString(path) {
125+
if !gpuExtraRegex.MatchString(path) {
126+
return nil
127+
}
128+
129+
if !sharedObjectRegex.MatchString(path) {
108130
return nil
109131
}
110132

@@ -176,6 +198,75 @@ func recursiveSymlinks(afs FS, mountpoint string, path string) ([]string, error)
176198
return paths, nil
177199
}
178200

201+
// SameDirSymlinks returns all links in the same directory as `target` that
202+
// point to target, either indirectly or directly. Only symlinks in the same
203+
// directory as `target` are considered.
204+
func SameDirSymlinks(afs FS, target string) ([]string, error) {
205+
// Get the list of files in the directory of the target.
206+
fis, err := afero.ReadDir(afs, filepath.Dir(target))
207+
if err != nil {
208+
return nil, xerrors.Errorf("read dir %q: %w", filepath.Dir(target), err)
209+
}
210+
211+
// Do an initial pass to map all symlinks to their destinations.
212+
allLinks := make(map[string]string)
213+
for _, fi := range fis {
214+
// Ignore non-symlinks.
215+
if fi.Mode()&os.ModeSymlink == 0 {
216+
continue
217+
}
218+
219+
absPath := filepath.Join(filepath.Dir(target), fi.Name())
220+
link, err := afs.Readlink(filepath.Join(filepath.Dir(target), fi.Name()))
221+
if err != nil {
222+
return nil, xerrors.Errorf("readlink %q: %w", fi.Name(), err)
223+
}
224+
225+
if !filepath.IsAbs(link) {
226+
link = filepath.Join(filepath.Dir(target), link)
227+
}
228+
allLinks[absPath] = link
229+
}
230+
231+
// Now we can start checking for symlinks that point to the target.
232+
var (
233+
found = make([]string, 0)
234+
// Set an arbitrary upper limit to prevent infinite loops.
235+
maxIterations = 10
236+
)
237+
for range maxIterations {
238+
var foundThisTime bool
239+
for linkName, linkDest := range allLinks {
240+
// Ignore symlinks that point outside of target's directory.
241+
if filepath.Dir(linkName) != filepath.Dir(target) {
242+
continue
243+
}
244+
245+
// If the symlink points to the target, add it to the list.
246+
if linkDest == target {
247+
if !slices.Contains(found, linkName) {
248+
found = append(found, linkName)
249+
foundThisTime = true
250+
}
251+
}
252+
253+
// If the symlink points to another symlink that we already determined
254+
// points to the target, add it to the list.
255+
if slices.Contains(found, linkDest) {
256+
if !slices.Contains(found, linkName) {
257+
found = append(found, linkName)
258+
foundThisTime = true
259+
}
260+
}
261+
}
262+
// If we didn't find any new symlinks, we're done.
263+
if !foundThisTime {
264+
break
265+
}
266+
}
267+
return found, nil
268+
}
269+
179270
// TryUnmountProcGPUDrivers unmounts any GPU-related mounts under /proc as it causes
180271
// issues when creating any container in some cases. Errors encountered while
181272
// unmounting are treated as non-fatal.

0 commit comments

Comments
 (0)