Skip to content

Commit df3fa11

Browse files
Yuanhong Peng, crosbymichael
Yuanhong Peng
authored and committed
Add support for cgroup namespace
Cgroup namespaces can be configured in `config.json` like the other namespaces. Here is an example: ``` "namespaces": [ { "type": "pid" }, { "type": "network" }, { "type": "ipc" }, { "type": "uts" }, { "type": "mount" }, { "type": "cgroup" } ], ``` Note that if you want to run a container that shares a cgroup namespace with another container, it is strongly recommended that you set a proper `CgroupsPath` for both containers (the second container's cgroup path must be a subdirectory of the first one's). Otherwise there might be some unexpected results. Signed-off-by: Yuanhong Peng <[email protected]> Signed-off-by: Michael Crosby <[email protected]>
1 parent 9a3a8a5 commit df3fa11

File tree

10 files changed

+216
-75
lines changed

10 files changed

+216
-75
lines changed

libcontainer/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ config := &configs.Config{
148148
{Type: configs.NEWPID},
149149
{Type: configs.NEWUSER},
150150
{Type: configs.NEWNET},
151+
{Type: configs.NEWCGROUP},
151152
}),
152153
Cgroups: &configs.Cgroup{
153154
Name: "test-container",

libcontainer/SPEC.md

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,17 @@ Minimum requirements:
2121

2222
### Namespaces
2323

24-
| Flag | Enabled |
25-
| ------------ | ------- |
26-
| CLONE_NEWPID | 1 |
27-
| CLONE_NEWUTS | 1 |
28-
| CLONE_NEWIPC | 1 |
29-
| CLONE_NEWNET | 1 |
30-
| CLONE_NEWNS | 1 |
31-
| CLONE_NEWUSER | 1 |
32-
33-
Namespaces are created for the container via the `clone` syscall.
24+
| Flag | Enabled |
25+
| --------------- | ------- |
26+
| CLONE_NEWPID | 1 |
27+
| CLONE_NEWUTS | 1 |
28+
| CLONE_NEWIPC | 1 |
29+
| CLONE_NEWNET | 1 |
30+
| CLONE_NEWNS | 1 |
31+
| CLONE_NEWUSER | 1 |
32+
| CLONE_NEWCGROUP | 1 |
33+
34+
Namespaces are created for the container via the `unshare` syscall.
3435

3536

3637
### Filesystem

libcontainer/cgroups/utils.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import (
1717
)
1818

1919
const (
20-
cgroupNamePrefix = "name="
20+
CgroupNamePrefix = "name="
2121
CgroupProcesses = "cgroup.procs"
2222
)
2323

@@ -156,8 +156,8 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
156156
continue
157157
}
158158
ss[opt] = true
159-
if strings.HasPrefix(opt, cgroupNamePrefix) {
160-
opt = opt[len(cgroupNamePrefix):]
159+
if strings.HasPrefix(opt, CgroupNamePrefix) {
160+
opt = opt[len(CgroupNamePrefix):]
161161
}
162162
m.Subsystems = append(m.Subsystems, opt)
163163
numFound++
@@ -343,7 +343,7 @@ func getControllerPath(subsystem string, cgroups map[string]string) (string, err
343343
return p, nil
344344
}
345345

346-
if p, ok := cgroups[cgroupNamePrefix+subsystem]; ok {
346+
if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
347347
return p, nil
348348
}
349349

libcontainer/configs/namespaces_syscall.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,14 @@ func (n *Namespace) Syscall() int {
88
return namespaceInfo[n.Type]
99
}
1010

11-
// This is not yet in the Go stdlib.
12-
const syscall_CLONE_NEWCGROUP = (1 << 29)
13-
1411
var namespaceInfo = map[NamespaceType]int{
1512
NEWNET: unix.CLONE_NEWNET,
1613
NEWNS: unix.CLONE_NEWNS,
1714
NEWUSER: unix.CLONE_NEWUSER,
1815
NEWIPC: unix.CLONE_NEWIPC,
1916
NEWUTS: unix.CLONE_NEWUTS,
2017
NEWPID: unix.CLONE_NEWPID,
21-
NEWCGROUP: syscall_CLONE_NEWCGROUP,
18+
NEWCGROUP: unix.CLONE_NEWCGROUP,
2219
}
2320

2421
// CloneFlags parses the container's Namespaces options to set the correct

libcontainer/configs/validate/validator.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
3838
if err := v.usernamespace(config); err != nil {
3939
return err
4040
}
41+
if err := v.cgroupnamespace(config); err != nil {
42+
return err
43+
}
4144
if err := v.sysctl(config); err != nil {
4245
return err
4346
}
@@ -116,6 +119,15 @@ func (v *ConfigValidator) usernamespace(config *configs.Config) error {
116119
return nil
117120
}
118121

122+
func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error {
123+
if config.Namespaces.Contains(configs.NEWCGROUP) {
124+
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
125+
return fmt.Errorf("cgroup namespaces aren't enabled in the kernel")
126+
}
127+
}
128+
return nil
129+
}
130+
119131
// sysctl validates that the specified sysctl keys are valid or not.
120132
// /proc/sys isn't completely namespaced and depending on which namespaces
121133
// are specified, a subset of sysctls are permitted.

libcontainer/container_linux.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1745,7 +1745,6 @@ func (c *linuxContainer) currentState() (*State, error) {
17451745
// can setns in order.
17461746
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
17471747
paths := []string{}
1748-
17491748
for _, ns := range configs.NamespaceTypes() {
17501749

17511750
// Remove namespaces that we don't need to join.

libcontainer/integration/exec_test.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1776,3 +1776,60 @@ func TestTmpfsCopyUp(t *testing.T) {
17761776
t.Fatalf("/etc/passwd not copied up as expected: %v", outputLs)
17771777
}
17781778
}
1779+
1780+
func TestCGROUPPrivate(t *testing.T) {
1781+
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
1782+
t.Skip("cgroupns is unsupported")
1783+
}
1784+
if testing.Short() {
1785+
return
1786+
}
1787+
1788+
rootfs, err := newRootfs()
1789+
ok(t, err)
1790+
defer remove(rootfs)
1791+
1792+
l, err := os.Readlink("/proc/1/ns/cgroup")
1793+
ok(t, err)
1794+
1795+
config := newTemplateConfig(rootfs)
1796+
config.Namespaces.Add(configs.NEWCGROUP, "")
1797+
buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
1798+
ok(t, err)
1799+
1800+
if exitCode != 0 {
1801+
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
1802+
}
1803+
1804+
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l {
1805+
t.Fatalf("cgroup link should be private to the container but equals host %q %q", actual, l)
1806+
}
1807+
}
1808+
1809+
func TestCGROUPHost(t *testing.T) {
1810+
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
1811+
t.Skip("cgroupns is unsupported")
1812+
}
1813+
if testing.Short() {
1814+
return
1815+
}
1816+
1817+
rootfs, err := newRootfs()
1818+
ok(t, err)
1819+
defer remove(rootfs)
1820+
1821+
l, err := os.Readlink("/proc/1/ns/cgroup")
1822+
ok(t, err)
1823+
1824+
config := newTemplateConfig(rootfs)
1825+
buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
1826+
ok(t, err)
1827+
1828+
if exitCode != 0 {
1829+
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
1830+
}
1831+
1832+
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
1833+
t.Fatalf("cgroup link not equal to host link %q %q", actual, l)
1834+
}
1835+
}

libcontainer/nsenter/nsexec.c

Lines changed: 39 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ enum sync_t {
4242
SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
4343
};
4444

45+
/*
46+
* Synchronisation value for cgroup namespace setup.
47+
* The same constant is defined in process_linux.go as "createCgroupns".
48+
*/
49+
#define CREATECGROUPNS 0x80
50+
4551
/* longjmp() arguments. */
4652
#define JUMP_PARENT 0x00
4753
#define JUMP_CHILD 0xA0
@@ -640,7 +646,6 @@ void nsexec(void)
640646
case JUMP_PARENT:{
641647
int len;
642648
pid_t child, first_child = -1;
643-
char buf[JSON_MAX];
644649
bool ready = false;
645650

646651
/* For debugging. */
@@ -716,6 +721,18 @@ void nsexec(void)
716721
kill(child, SIGKILL);
717722
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
718723
}
724+
725+
/* Report the child pids to our parent.
726+
*
727+
* Send the init_func pid and the pid of the first child back to our parent.
728+
* We need to send both back because we can't reap the first child we created (CLONE_PARENT).
729+
* It becomes the responsibility of our parent to reap the first child.
730+
*/
731+
len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
732+
if (len < 0) {
733+
kill(child, SIGKILL);
734+
bail("unable to generate JSON for child pid");
735+
}
719736
}
720737
break;
721738
case SYNC_CHILD_READY:
@@ -759,23 +776,6 @@ void nsexec(void)
759776
bail("unexpected sync value: %u", s);
760777
}
761778
}
762-
763-
/*
764-
* Send the init_func pid and the pid of the first child back to our parent.
765-
*
766-
* We need to send both back because we can't reap the first child we created (CLONE_PARENT).
767-
* It becomes the responsibility of our parent to reap the first child.
768-
*/
769-
len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
770-
if (len < 0) {
771-
kill(child, SIGKILL);
772-
bail("unable to generate JSON for child pid");
773-
}
774-
if (write(pipenum, buf, len) != len) {
775-
kill(child, SIGKILL);
776-
bail("unable to send child pid to bootstrapper");
777-
}
778-
779779
exit(0);
780780
}
781781

@@ -862,14 +862,17 @@ void nsexec(void)
862862
if (setresuid(0, 0, 0) < 0)
863863
bail("failed to become root in user namespace");
864864
}
865-
866865
/*
867-
* Unshare all of the namespaces. Note that we don't merge this
868-
* with clone() because there were some old kernel versions where
869-
* clone(CLONE_PARENT | CLONE_NEWPID) was broken, so we'll just do
870-
* it the long way.
866+
* Unshare all of the namespaces. Now, it should be noted that this
867+
* ordering might break in the future (especially with rootless
868+
* containers). But for now, it's not possible to split this into
869+
* CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
870+
*
871+
* Note that we don't merge this with clone() because there were
872+
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
873+
* was broken, so we'll just do it the long way anyway.
871874
*/
872-
if (unshare(config.cloneflags) < 0)
875+
if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
873876
bail("failed to unshare namespaces");
874877

875878
/*
@@ -958,6 +961,18 @@ void nsexec(void)
958961
bail("setgroups failed");
959962
}
960963

964+
/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
965+
if (config.cloneflags & CLONE_NEWCGROUP) {
966+
uint8_t value;
967+
if (read(pipenum, &value, sizeof(value)) != sizeof(value))
968+
bail("read synchronisation value failed");
969+
if (value == CREATECGROUPNS) {
970+
if (unshare(CLONE_NEWCGROUP) < 0)
971+
bail("failed to unshare cgroup namespace");
972+
} else
973+
bail("received unknown synchronisation value");
974+
}
975+
961976
s = SYNC_CHILD_READY;
962977
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
963978
bail("failed to sync with patent: write(SYNC_CHILD_READY)");

libcontainer/process_linux.go

Lines changed: 49 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ import (
2222
"golang.org/x/sys/unix"
2323
)
2424

25+
// Synchronisation value for cgroup namespace setup.
26+
// The same constant is defined in nsexec.c as "CREATECGROUPNS".
27+
const createCgroupns = 0x80
28+
2529
type parentProcess interface {
2630
// pid returns the pid for the running process.
2731
pid() int
@@ -225,12 +229,17 @@ func (p *initProcess) externalDescriptors() []string {
225229
return p.fds
226230
}
227231

228-
// execSetns runs the process that executes C code to perform the setns calls
229-
// because setns support requires the C process to fork off a child and perform the setns
230-
// before the go runtime boots, we wait on the process to die and receive the child's pid
231-
// over the provided pipe.
232-
// This is called by initProcess.start function
233-
func (p *initProcess) execSetns() error {
232+
// getChildPid receives the final child's pid over the provided pipe.
233+
func (p *initProcess) getChildPid() (int, error) {
234+
var pid pid
235+
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
236+
p.cmd.Wait()
237+
return -1, err
238+
}
239+
return pid.Pid, nil
240+
}
241+
242+
func (p *initProcess) waitForChildExit(childPid int) error {
234243
status, err := p.cmd.Process.Wait()
235244
if err != nil {
236245
p.cmd.Wait()
@@ -240,22 +249,8 @@ func (p *initProcess) execSetns() error {
240249
p.cmd.Wait()
241250
return &exec.ExitError{ProcessState: status}
242251
}
243-
var pid *pid
244-
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
245-
p.cmd.Wait()
246-
return err
247-
}
248-
249-
// Clean up the zombie parent process
250-
firstChildProcess, err := os.FindProcess(pid.PidFirstChild)
251-
if err != nil {
252-
return err
253-
}
254-
255-
// Ignore the error in case the child has already been reaped for any reason
256-
_, _ = firstChildProcess.Wait()
257252

258-
process, err := os.FindProcess(pid.Pid)
253+
process, err := os.FindProcess(childPid)
259254
if err != nil {
260255
return err
261256
}
@@ -297,19 +292,47 @@ func (p *initProcess) start() error {
297292
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
298293
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
299294
}
300-
301-
if err := p.execSetns(); err != nil {
302-
return newSystemErrorWithCause(err, "running exec setns process for init")
295+
childPid, err := p.getChildPid()
296+
if err != nil {
297+
return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
303298
}
304299

305300
// Save the standard descriptor names before the container process
306301
// can potentially move them (e.g., via dup2()). If we don't do this now,
307302
// we won't know at checkpoint time which file descriptor to look up.
308-
fds, err := getPipeFds(p.pid())
303+
fds, err := getPipeFds(childPid)
309304
if err != nil {
310-
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
305+
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
311306
}
312307
p.setExternalDescriptors(fds)
308+
// Do this before syncing with child so that no children
309+
// can escape the cgroup
310+
if err := p.manager.Apply(childPid); err != nil {
311+
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
312+
}
313+
if p.intelRdtManager != nil {
314+
if err := p.intelRdtManager.Apply(childPid); err != nil {
315+
return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
316+
}
317+
}
318+
// Now it's time to set up the cgroup namespace
319+
if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
320+
if _, err := p.parentPipe.Write([]byte{createCgroupns}); err != nil {
321+
return newSystemErrorWithCause(err, "sending synchronization value to init process")
322+
}
323+
}
324+
325+
// Wait for our first child to exit
326+
if err := p.waitForChildExit(childPid); err != nil {
327+
return newSystemErrorWithCause(err, "waiting for our first child to exit")
328+
}
329+
330+
defer func() {
331+
if err != nil {
332+
// TODO: it should not be our responsibility to call Destroy here
333+
p.manager.Destroy()
334+
}
335+
}()
313336
if err := p.createNetworkInterfaces(); err != nil {
314337
return newSystemErrorWithCause(err, "creating network interfaces")
315338
}

0 commit comments

Comments
 (0)