Skip to content

Commit 53be214

Browse files
committed
Refactor how opaque device configs are handled
Previously, each config was being applied independently to each request that referenced it. However, some configs may need to operate collectively on all of the requests they are associated with. The code has been refactored to handle this situation. Additionally, the code to define the ContainerEdits for any custom config has been moved into the config code itself to better encapsulate it. Signed-off-by: Kevin Klues <[email protected]>
1 parent 5b1228f commit 53be214

File tree

4 files changed

+195
-140
lines changed

4 files changed

+195
-140
lines changed

README.md

+39-39
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ items:
127127
string: gpu-18db0e85-99e9-c746-8531-ffeb86328b39
128128
capacity:
129129
memory: 80Gi
130-
name: gpu-18db0e85-99e9-c746-8531-ffeb86328b39
130+
name: gpu-0
131131
- basic:
132132
attributes:
133133
driverVersion:
@@ -140,7 +140,7 @@ items:
140140
string: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac
141141
capacity:
142142
memory: 80Gi
143-
name: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac
143+
name: gpu-1
144144
- basic:
145145
attributes:
146146
driverVersion:
@@ -153,7 +153,7 @@ items:
153153
string: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744
154154
capacity:
155155
memory: 80Gi
156-
name: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744
156+
name: gpu-2
157157
- basic:
158158
attributes:
159159
driverVersion:
@@ -166,7 +166,7 @@ items:
166166
string: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243
167167
capacity:
168168
memory: 80Gi
169-
name: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243
169+
name: gpu-3
170170
- basic:
171171
attributes:
172172
driverVersion:
@@ -179,7 +179,7 @@ items:
179179
string: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747
180180
capacity:
181181
memory: 80Gi
182-
name: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747
182+
name: gpu-4
183183
- basic:
184184
attributes:
185185
driverVersion:
@@ -192,7 +192,7 @@ items:
192192
string: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e
193193
capacity:
194194
memory: 80Gi
195-
name: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e
195+
name: gpu-5
196196
- basic:
197197
attributes:
198198
driverVersion:
@@ -205,7 +205,7 @@ items:
205205
string: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac
206206
capacity:
207207
memory: 80Gi
208-
name: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac
208+
name: gpu-6
209209
- basic:
210210
attributes:
211211
driverVersion:
@@ -218,7 +218,7 @@ items:
218218
string: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b
219219
capacity:
220220
memory: 80Gi
221-
name: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b
221+
name: gpu-7
222222
kind: List
223223
metadata:
224224
resourceVersion: ""
@@ -261,9 +261,9 @@ for example in $(seq 1 5); do \
261261
for ctr in $(kubectl get pod -n gpu-test${example} ${pod} -o jsonpath='{.spec.containers[*].name}'); do \
262262
echo "${pod} ${ctr}:"
263263
if [ "${example}" -lt 3 ]; then
264-
kubectl logs -n gpu-test${example} ${pod} -c ${ctr}| grep -E "GPU_DEVICE_[0-9]+="
264+
kubectl logs -n gpu-test${example} ${pod} -c ${ctr}| grep -E "GPU_DEVICE_[0-9]+=" | grep -v "RESOURCE_CLAIM"
265265
else
266-
kubectl logs -n gpu-test${example} ${pod} -c ${ctr}| grep -E "GPU_DEVICE_[0-9]+"
266+
kubectl logs -n gpu-test${example} ${pod} -c ${ctr}| grep -E "GPU_DEVICE_[0-9]+" | grep -v "RESOURCE_CLAIM"
267267
fi
268268
done
269269
done
@@ -275,60 +275,60 @@ This should produce output similar to the following:
275275
```bash
276276
gpu-test1:
277277
pod0 ctr0:
278-
declare -x GPU_DEVICE_0="gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744"
278+
declare -x GPU_DEVICE_6="gpu-6"
279279
pod1 ctr0:
280-
declare -x GPU_DEVICE_0="gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243"
280+
declare -x GPU_DEVICE_7="gpu-7"
281281

282282
gpu-test2:
283283
pod0 ctr0:
284-
declare -x GPU_DEVICE_0="gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747"
285-
declare -x GPU_DEVICE_1="gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e"
284+
declare -x GPU_DEVICE_0="gpu-0"
285+
declare -x GPU_DEVICE_1="gpu-1"
286286

287287
gpu-test3:
288288
pod0 ctr0:
289-
declare -x GPU_DEVICE_0="gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac"
290-
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
291-
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
289+
declare -x GPU_DEVICE_2="gpu-2"
290+
declare -x GPU_DEVICE_2_SHARING_STRATEGY="TimeSlicing"
291+
declare -x GPU_DEVICE_2_TIMESLICE_INTERVAL="Default"
292292
pod0 ctr1:
293-
declare -x GPU_DEVICE_0="gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac"
294-
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
295-
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
293+
declare -x GPU_DEVICE_2="gpu-2"
294+
declare -x GPU_DEVICE_2_SHARING_STRATEGY="TimeSlicing"
295+
declare -x GPU_DEVICE_2_TIMESLICE_INTERVAL="Default"
296296

297297
gpu-test4:
298298
pod0 ctr0:
299-
declare -x GPU_DEVICE_0="gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b"
300-
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
301-
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
299+
declare -x GPU_DEVICE_3="gpu-3"
300+
declare -x GPU_DEVICE_3_SHARING_STRATEGY="TimeSlicing"
301+
declare -x GPU_DEVICE_3_TIMESLICE_INTERVAL="Default"
302302
pod1 ctr0:
303-
declare -x GPU_DEVICE_0="gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b"
304-
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
305-
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
303+
declare -x GPU_DEVICE_3="gpu-3"
304+
declare -x GPU_DEVICE_3_SHARING_STRATEGY="TimeSlicing"
305+
declare -x GPU_DEVICE_3_TIMESLICE_INTERVAL="Default"
306306

307307
gpu-test5:
308308
pod0 ts-ctr0:
309-
declare -x GPU_DEVICE_0="gpu-18db0e85-99e9-c746-8531-ffeb86328b39"
310-
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
311-
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Long"
309+
declare -x GPU_DEVICE_4="gpu-4"
310+
declare -x GPU_DEVICE_4_SHARING_STRATEGY="TimeSlicing"
311+
declare -x GPU_DEVICE_4_TIMESLICE_INTERVAL="Long"
312312
pod0 ts-ctr1:
313-
declare -x GPU_DEVICE_0="gpu-18db0e85-99e9-c746-8531-ffeb86328b39"
314-
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
315-
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Long"
313+
declare -x GPU_DEVICE_4="gpu-4"
314+
declare -x GPU_DEVICE_4_SHARING_STRATEGY="TimeSlicing"
315+
declare -x GPU_DEVICE_4_TIMESLICE_INTERVAL="Long"
316316
pod0 sp-ctr0:
317-
declare -x GPU_DEVICE_1="gpu-93d37703-997c-c46f-a531-755e3e0dc2ac"
318-
declare -x GPU_DEVICE_1_PARTITION_COUNT="10"
319-
declare -x GPU_DEVICE_1_SHARING_STRATEGY="SpacePartitioning"
317+
declare -x GPU_DEVICE_5="gpu-5"
318+
declare -x GPU_DEVICE_5_PARTITION_COUNT="10"
319+
declare -x GPU_DEVICE_5_SHARING_STRATEGY="SpacePartitioning"
320320
pod0 sp-ctr1:
321-
declare -x GPU_DEVICE_1="gpu-93d37703-997c-c46f-a531-755e3e0dc2ac"
322-
declare -x GPU_DEVICE_1_PARTITION_COUNT="10"
323-
declare -x GPU_DEVICE_1_SHARING_STRATEGY="SpacePartitioning"
321+
declare -x GPU_DEVICE_5="gpu-5"
322+
declare -x GPU_DEVICE_5_PARTITION_COUNT="10"
323+
declare -x GPU_DEVICE_5_SHARING_STRATEGY="SpacePartitioning"
324324
```
325325

326326
In this example resource driver, no "actual" GPUs are made available to any
327327
containers. Instead, a set of environment variables is set in each container
328328
to indicate which GPUs *would* have been injected into them by a real resource
329329
driver and how they *would* have been configured.
330330

331-
You can use the UUIDs of the GPUs as well as the GPU sharing settings set in
331+
You can use the IDs of the GPUs as well as the GPU sharing settings set in
332332
these environment variables to verify that they were handed out in a way
333333
consistent with the semantics shown in the figure above.
334334

cmd/dra-example-kubeletplugin/cdi.go

+12-29
Original file line numberDiff line numberDiff line change
@@ -89,36 +89,19 @@ func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, devices PreparedDevi
8989
Devices: []cdispec.Device{},
9090
}
9191

92-
for i, device := range devices {
93-
envs := []string{
94-
fmt.Sprintf("GPU_DEVICE_%d=%s", i, device.DeviceName),
95-
}
96-
97-
if device.Config.Sharing != nil {
98-
envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_SHARING_STRATEGY=%s", i, device.Config.Sharing.Strategy))
99-
}
100-
101-
switch {
102-
case device.Config.Sharing.IsTimeSlicing():
103-
tsconfig, err := device.Config.Sharing.GetTimeSlicingConfig()
104-
if err != nil {
105-
return fmt.Errorf("unable to get time slicing config for device %v: %v", device.DeviceName, err)
106-
}
107-
envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_TIMESLICE_INTERVAL=%v", i, tsconfig.Interval))
108-
109-
case device.Config.Sharing.IsSpacePartitioning():
110-
spconfig, err := device.Config.Sharing.GetSpacePartitioningConfig()
111-
if err != nil {
112-
return fmt.Errorf("unable to get space partitioning config for device %v: %v", device.DeviceName, err)
113-
}
114-
envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_PARTITION_COUNT=%v", i, spconfig.PartitionCount))
92+
for _, device := range devices {
93+
claimEdits := cdiapi.ContainerEdits{
94+
ContainerEdits: &cdispec.ContainerEdits{
95+
Env: []string{
96+
fmt.Sprintf("GPU_DEVICE_%s_RESOURCE_CLAIM=%s", device.DeviceName[4:], claimUID),
97+
},
98+
},
11599
}
100+
claimEdits.Append(device.ContainerEdits)
116101

117102
cdiDevice := cdispec.Device{
118-
Name: device.DeviceName,
119-
ContainerEdits: cdispec.ContainerEdits{
120-
Env: envs,
121-
},
103+
Name: fmt.Sprintf("%s-%s", claimUID, device.DeviceName),
104+
ContainerEdits: *claimEdits.ContainerEdits,
122105
}
123106

124107
spec.Devices = append(spec.Devices, cdiDevice)
@@ -138,13 +121,13 @@ func (cdi *CDIHandler) DeleteClaimSpecFile(claimUID string) error {
138121
return cdi.cache.RemoveSpec(specName)
139122
}
140123

141-
func (cdi *CDIHandler) GetClaimDevices(devices []string) []string {
124+
func (cdi *CDIHandler) GetClaimDevices(claimUID string, devices []string) []string {
142125
cdiDevices := []string{
143126
cdiparser.QualifiedName(cdiVendor, cdiClass, cdiCommonDeviceName),
144127
}
145128

146129
for _, device := range devices {
147-
cdiDevice := cdiparser.QualifiedName(cdiVendor, cdiClass, device)
130+
cdiDevice := cdiparser.QualifiedName(cdiVendor, cdiClass, fmt.Sprintf("%s-%s", claimUID, device))
148131
cdiDevices = append(cdiDevices, cdiDevice)
149132
}
150133

cmd/dra-example-kubeletplugin/discovery.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package main
1818

1919
import (
20+
"fmt"
2021
"math/rand"
2122
"os"
2223

@@ -35,7 +36,7 @@ func enumerateAllPossibleDevices() (AllocatableDevices, error) {
3536
alldevices := make(AllocatableDevices)
3637
for i, uuid := range uuids {
3738
device := resourceapi.Device{
38-
Name: uuid,
39+
Name: fmt.Sprintf("gpu-%d", i),
3940
Basic: &resourceapi.BasicDevice{
4041
Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{
4142
"index": {
@@ -56,7 +57,7 @@ func enumerateAllPossibleDevices() (AllocatableDevices, error) {
5657
},
5758
},
5859
}
59-
alldevices[uuid] = device
60+
alldevices[device.Name] = device
6061
}
6162
return alldevices, nil
6263
}

0 commit comments

Comments
 (0)