Skip to content

Commit 73f576c

Browse files
hnaz authored and torvalds committed
mm: memcontrol: fix cgroup creation failure after many small jobs
The memory controller has quite a bit of state that usually outlives the cgroup and pins its CSS until said state disappears. At the same time it imposes a 16-bit limit on the CSS ID space to economically store IDs in the wild. Consequently, when we use cgroups to contain frequent but small and short-lived jobs that leave behind some page cache, we quickly run into the 64k limitations of outstanding CSSs. Creating a new cgroup fails with -ENOSPC while there are only a few, or even no user-visible cgroups in existence. Although pinning CSSs past cgroup removal is common, there are only two instances that actually need an ID after a cgroup is deleted: cache shadow entries and swapout records. Cache shadow entries reference the ID weakly and can deal with the CSS having disappeared when it's looked up later. They pose no hurdle. Swap-out records do need to pin the css to hierarchically attribute swapins after the cgroup has been deleted; though the only pages that remain swapped out after offlining are tmpfs/shmem pages. And those references are under the user's control, so they are manageable. This patch introduces a private 16-bit memcg ID and switches swap and cache shadow entries over to using that. This ID can then be recycled after offlining when the CSS remains pinned only by objects that don't specifically need it. This script demonstrates the problem by faulting one cache page in a new cgroup and deleting it again: set -e mkdir -p pages for x in `seq 128000`; do [ $((x % 1000)) -eq 0 ] && echo $x mkdir /cgroup/foo echo $$ >/cgroup/foo/cgroup.procs echo trex >pages/$x echo $$ >/cgroup/cgroup.procs rmdir /cgroup/foo done When run on an unpatched kernel, we eventually run out of possible IDs even though there are no visible cgroups: [root@ham ~]# ./cssidstress.sh [...] 
65000 mkdir: cannot create directory '/cgroup/foo': No space left on device After this patch, the IDs get released upon cgroup destruction and the cache and css objects get released once memory reclaim kicks in. [[email protected]: init the IDR] Link: http://lkml.kernel.org/r/[email protected] Fixes: b205256 ("mm: memcontrol: continue cache reclaim from offlined groups") Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: Johannes Weiner <[email protected]> Reported-by: John Garcia <[email protected]> Reviewed-by: Vladimir Davydov <[email protected]> Acked-by: Tejun Heo <[email protected]> Cc: Nikolay Borisov <[email protected]> Cc: <[email protected]> [3.19+] Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 47ef4ad commit 73f576c

File tree

3 files changed

+87
-24
lines changed

3 files changed

+87
-24
lines changed

include/linux/memcontrol.h

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,11 @@ enum mem_cgroup_events_target {
9797
#define MEM_CGROUP_ID_SHIFT 16
9898
#define MEM_CGROUP_ID_MAX USHRT_MAX
9999

100+
/*
 * Private memcg ID together with a reference count on that ID.
 */
struct mem_cgroup_id {
101+
/* ID value; allocated from mem_cgroup_idr and reset to 0 once released */
int id;
102+
/* References held on the ID; the ID is released when this drops to zero */
atomic_t ref;
103+
};
104+
100105
struct mem_cgroup_stat_cpu {
101106
long count[MEMCG_NR_STAT];
102107
unsigned long events[MEMCG_NR_EVENTS];
@@ -172,6 +177,9 @@ enum memcg_kmem_state {
172177
struct mem_cgroup {
173178
struct cgroup_subsys_state css;
174179

180+
/* Private memcg ID. Used to ID objects that outlive the cgroup */
181+
struct mem_cgroup_id id;
182+
175183
/* Accounted resources */
176184
struct page_counter memory;
177185
struct page_counter swap;
@@ -330,22 +338,9 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
330338
if (mem_cgroup_disabled())
331339
return 0;
332340

333-
return memcg->css.id;
334-
}
335-
336-
/**
337-
* mem_cgroup_from_id - look up a memcg from an id
338-
* @id: the id to look up
339-
*
340-
* Caller must hold rcu_read_lock() and use css_tryget() as necessary.
341-
*/
342-
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
343-
{
344-
struct cgroup_subsys_state *css;
345-
346-
css = css_from_id(id, &memory_cgrp_subsys);
347-
return mem_cgroup_from_css(css);
341+
return memcg->id.id;
348342
}
343+
struct mem_cgroup *mem_cgroup_from_id(unsigned short id);
349344

350345
/**
351346
* parent_mem_cgroup - find the accounting parent of a memcg

mm/memcontrol.c

Lines changed: 75 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4057,6 +4057,60 @@ static struct cftype mem_cgroup_legacy_files[] = {
40574057
{ }, /* terminate */
40584058
};
40594059

4060+
/*
4061+
* Private memory cgroup IDR
4062+
*
4063+
* Swap-out records and page cache shadow entries need to store memcg
4064+
* references in constrained space, so we maintain an ID space that is
4065+
* limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
4066+
* memory-controlled cgroups to 64k.
4067+
*
4068+
* However, there usually are many references to the offline CSS after
4069+
* the cgroup has been destroyed, such as page cache or reclaimable
4070+
* slab objects, that don't need to hang on to the ID. We want to keep
4071+
* those dead CSS from occupying IDs, or we might quickly exhaust the
4072+
* relatively small ID space and prevent the creation of new cgroups
4073+
* even when there are much fewer than 64k cgroups - possibly none.
4074+
*
4075+
* Maintain a private 16-bit ID space for memcg, and allow the ID to
4076+
* be freed and recycled when it's no longer needed, which is usually
4077+
* when the CSS is offlined.
4078+
*
4079+
* The only exception to that are records of swapped out tmpfs/shmem
4080+
* pages that need to be attributed to live ancestors on swapin. But
4081+
* those references are manageable from userspace.
4082+
*/
4083+
4084+
static DEFINE_IDR(mem_cgroup_idr);
4085+
4086+
/*
 * Take an additional reference on @memcg's private ID by incrementing
 * memcg->id.ref. Each call is expected to be balanced by a later
 * mem_cgroup_id_put().
 */
static void mem_cgroup_id_get(struct mem_cgroup *memcg)
4087+
{
4088+
atomic_inc(&memcg->id.ref);
4089+
}
4090+
4091+
/*
 * Drop one reference on @memcg's private ID. When the last reference goes
 * away, the ID is removed from mem_cgroup_idr (making it available for
 * reuse), memcg->id.id is reset to 0, and one reference on the CSS is
 * dropped.
 */
static void mem_cgroup_id_put(struct mem_cgroup *memcg)
4092+
{
4093+
if (atomic_dec_and_test(&memcg->id.ref)) {
4094+
idr_remove(&mem_cgroup_idr, memcg->id.id);
4095+
memcg->id.id = 0;
4096+
4097+
/* Memcg ID pins CSS */
4098+
css_put(&memcg->css);
4099+
}
4100+
}
4101+
4102+
/**
4103+
* mem_cgroup_from_id - look up a memcg from a memcg id
4104+
* @id: the memcg id to look up
4105+
*
4106+
* Caller must hold rcu_read_lock().
*
* No reference is taken on the returned memcg. The result is whatever
* idr_find() yields for @id in mem_cgroup_idr; that is NULL when no memcg
* is registered under @id, including an ID that mem_cgroup_alloc() has
* reserved with a NULL pointer but not yet published via idr_replace().
4107+
*/
4108+
struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4109+
{
4110+
/* Flag callers that violate the RCU requirement stated above */
WARN_ON_ONCE(!rcu_read_lock_held());
4111+
return idr_find(&mem_cgroup_idr, id);
4112+
}
4113+
40604114
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
40614115
{
40624116
struct mem_cgroup_per_node *pn;
@@ -4116,6 +4170,12 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
41164170
if (!memcg)
41174171
return NULL;
41184172

4173+
memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
4174+
1, MEM_CGROUP_ID_MAX,
4175+
GFP_KERNEL);
4176+
if (memcg->id.id < 0)
4177+
goto fail;
4178+
41194179
memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
41204180
if (!memcg->stat)
41214181
goto fail;
@@ -4142,8 +4202,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
41424202
#ifdef CONFIG_CGROUP_WRITEBACK
41434203
INIT_LIST_HEAD(&memcg->cgwb_list);
41444204
#endif
4205+
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
41454206
return memcg;
41464207
fail:
4208+
if (memcg->id.id > 0)
4209+
idr_remove(&mem_cgroup_idr, memcg->id.id);
41474210
mem_cgroup_free(memcg);
41484211
return NULL;
41494212
}
@@ -4206,12 +4269,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
42064269
return ERR_PTR(-ENOMEM);
42074270
}
42084271

4209-
static int
4210-
mem_cgroup_css_online(struct cgroup_subsys_state *css)
4272+
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
42114273
{
4212-
if (css->id > MEM_CGROUP_ID_MAX)
4213-
return -ENOSPC;
4214-
4274+
/* Online state pins memcg ID, memcg ID pins CSS */
4275+
mem_cgroup_id_get(mem_cgroup_from_css(css));
4276+
css_get(css);
42154277
return 0;
42164278
}
42174279

@@ -4234,6 +4296,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
42344296

42354297
memcg_offline_kmem(memcg);
42364298
wb_memcg_offline(memcg);
4299+
4300+
mem_cgroup_id_put(memcg);
42374301
}
42384302

42394303
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
@@ -5756,6 +5820,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
57565820
if (!memcg)
57575821
return;
57585822

5823+
mem_cgroup_id_get(memcg);
57595824
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
57605825
VM_BUG_ON_PAGE(oldid, page);
57615826
mem_cgroup_swap_statistics(memcg, true);
@@ -5774,6 +5839,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
57745839
VM_BUG_ON(!irqs_disabled());
57755840
mem_cgroup_charge_statistics(memcg, page, false, -1);
57765841
memcg_check_events(memcg, page);
5842+
5843+
if (!mem_cgroup_is_root(memcg))
5844+
css_put(&memcg->css);
57775845
}
57785846

57795847
/*
@@ -5804,11 +5872,11 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
58045872
!page_counter_try_charge(&memcg->swap, 1, &counter))
58055873
return -ENOMEM;
58065874

5875+
mem_cgroup_id_get(memcg);
58075876
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
58085877
VM_BUG_ON_PAGE(oldid, page);
58095878
mem_cgroup_swap_statistics(memcg, true);
58105879

5811-
css_get(&memcg->css);
58125880
return 0;
58135881
}
58145882

@@ -5837,7 +5905,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
58375905
page_counter_uncharge(&memcg->memsw, 1);
58385906
}
58395907
mem_cgroup_swap_statistics(memcg, false);
5840-
css_put(&memcg->css);
5908+
mem_cgroup_id_put(memcg);
58415909
}
58425910
rcu_read_unlock();
58435911
}

mm/slab_common.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -526,8 +526,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
526526
goto out_unlock;
527527

528528
cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
529-
cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
530-
css->id, memcg_name_buf);
529+
cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name,
530+
css->serial_nr, memcg_name_buf);
531531
if (!cache_name)
532532
goto out_unlock;
533533

0 commit comments

Comments
 (0)