Skip to content

Commit ac311a1

Browse files
shakeelbtorvalds
authored andcommitted
oom: decouple mems_allowed from oom_unkillable_task
Commit ef08e3b ("[PATCH] cpusets: confine oom_killer to mem_exclusive cpuset") introduced a heuristic where a potential oom-killer victim is skipped if the intersection of the mems_allowed of the potential victim and of current (the process that triggered the oom) is empty, on the grounds that killing such a victim most probably will not help the current allocating process. However, commit 7887a3d ("[PATCH] oom: cpuset hint") changed the heuristic to just decrease the oom_badness score of such potential victims, on the grounds that the cpuset of such processes might have changed and they may previously have allocated memory on mems from which the current allocating process can allocate. Unintentionally, 7887a3d ("[PATCH] oom: cpuset hint") introduced a side effect: because oom_badness is also exposed to user space through /proc/[pid]/oom_score, readers with different cpusets can read different oom_score values for the same process. Later, commit 6cf86ac ("oom: filter tasks not sharing the same cpuset") fixed the side effect introduced by 7887a3d by moving the cpuset intersection check out of oom_badness and back into the oom-killer context only. However, the combination of ab290ad ("oom: make oom_unkillable_task() helper function") and 26ebc98 ("oom: /proc/<pid>/oom_score treat kernel thread honestly") unintentionally brought the cpuset intersection check back into the oom_badness calculation function.
Besides the cpuset/mempolicy intersection check being done from oom_badness, the memcg oom context also performs the cpuset/mempolicy intersection check, which is quite wrong and was caught by syzkaller with the following report: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 28426 Comm: syz-executor.5 Not tainted 5.2.0-rc3-next-20190607 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline] RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline] RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline] RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155 Code: c1 ea 03 80 3c 02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff RSP: 0018:ffff888000127490 EFLAGS: 00010a03 RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001 RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0 R10: ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007 R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6 FS: 00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000607304 CR3: 000000009237e000 CR4: 00000000001426f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 Call Trace: oom_evaluate_task+0x49/0x520 mm/oom_kill.c:321 mem_cgroup_scan_tasks+0xcc/0x180 mm/memcontrol.c:1169 select_bad_process mm/oom_kill.c:374 [inline] out_of_memory mm/oom_kill.c:1088 [inline] out_of_memory+0x6b2/0x1280 mm/oom_kill.c:1035 mem_cgroup_out_of_memory+0x1ca/0x230 mm/memcontrol.c:1573 mem_cgroup_oom 
mm/memcontrol.c:1905 [inline] try_charge+0xfbe/0x1480 mm/memcontrol.c:2468 mem_cgroup_try_charge+0x24d/0x5e0 mm/memcontrol.c:6073 mem_cgroup_try_charge_delay+0x1f/0xa0 mm/memcontrol.c:6088 do_huge_pmd_wp_page_fallback+0x24f/0x1680 mm/huge_memory.c:1201 do_huge_pmd_wp_page+0x7fc/0x2160 mm/huge_memory.c:1359 wp_huge_pmd mm/memory.c:3793 [inline] __handle_mm_fault+0x164c/0x3eb0 mm/memory.c:4006 handle_mm_fault+0x3b7/0xa90 mm/memory.c:4053 do_user_addr_fault arch/x86/mm/fault.c:1455 [inline] __do_page_fault+0x5ef/0xda0 arch/x86/mm/fault.c:1521 do_page_fault+0x71/0x57d arch/x86/mm/fault.c:1552 page_fault+0x1e/0x30 arch/x86/entry/entry_64.S:1156 RIP: 0033:0x400590 Code: 06 e9 49 01 00 00 48 8b 44 24 10 48 0b 44 24 28 75 1f 48 8b 14 24 48 8b 7c 24 20 be 04 00 00 00 e8 f5 56 00 00 48 8b 74 24 08 <89> 06 e9 1e 01 00 00 48 8b 44 24 08 48 8b 14 24 be 04 00 00 00 8b RSP: 002b:00007fff7bc49780 EFLAGS: 00010206 RAX: 0000000000000001 RBX: 0000000000760000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 000000002000cffc RDI: 0000000000000001 RBP: fffffffffffffffe R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000075 R11: 0000000000000246 R12: 0000000000760008 R13: 00000000004c55f2 R14: 0000000000000000 R15: 00007fff7bc499b0 Modules linked in: ---[ end trace a65689219582ffff ]--- RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline] RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline] RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline] RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155 Code: c1 ea 03 80 3c 02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff RSP: 0018:ffff888000127490 EFLAGS: 00010a03 RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001 RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0 R10: 
ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007 R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6 FS: 00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2f823000 CR3: 000000009237e000 CR4: 00000000001426f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 The fix is to decouple the cpuset/mempolicy intersection check from oom_unkillable_task() and make sure cpuset/mempolicy intersection check is only done in the global oom context. [[email protected]: change function name and update comment] Link: http://lkml.kernel.org/r/[email protected] Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: Shakeel Butt <[email protected]> Reported-by: [email protected] Acked-by: Roman Gushchin <[email protected]> Acked-by: Michal Hocko <[email protected]> Cc: David Rientjes <[email protected]> Cc: Johannes Weiner <[email protected]> Cc: KOSAKI Motohiro <[email protected]> Cc: Nick Piggin <[email protected]> Cc: Paul Jackson <[email protected]> Cc: Tetsuo Handa <[email protected]> Cc: Vladimir Davydov <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 6ba749e commit ac311a1

File tree

3 files changed

+33
-28
lines changed

3 files changed

+33
-28
lines changed

fs/proc/base.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -532,8 +532,7 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
532532
unsigned long totalpages = totalram_pages() + total_swap_pages;
533533
unsigned long points = 0;
534534

535-
points = oom_badness(task, NULL, totalpages) *
536-
1000 / totalpages;
535+
points = oom_badness(task, totalpages) * 1000 / totalpages;
537536
seq_printf(m, "%lu\n", points);
538537

539538
return 0;

include/linux/oom.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
108108
bool __oom_reap_task_mm(struct mm_struct *mm);
109109

110110
extern unsigned long oom_badness(struct task_struct *p,
111-
const nodemask_t *nodemask,
112111
unsigned long totalpages);
113112

114113
extern bool out_of_memory(struct oom_control *oc);

mm/oom_kill.c

Lines changed: 32 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -64,21 +64,33 @@ int sysctl_oom_dump_tasks = 1;
6464
*/
6565
DEFINE_MUTEX(oom_lock);
6666

67+
static inline bool is_memcg_oom(struct oom_control *oc)
68+
{
69+
return oc->memcg != NULL;
70+
}
71+
6772
#ifdef CONFIG_NUMA
6873
/**
69-
* has_intersects_mems_allowed() - check task eligiblity for kill
74+
* oom_cpuset_eligible() - check task eligibility for kill
7075
* @start: task struct of which task to consider
7176
* @mask: nodemask passed to page allocator for mempolicy ooms
7277
*
7378
* Task eligibility is determined by whether or not a candidate task, @tsk,
7479
* shares the same mempolicy nodes as current if it is bound by such a policy
7580
* and whether or not it has the same set of allowed cpuset nodes.
81+
*
82+
* This function is assuming oom-killer context and 'current' has triggered
83+
* the oom-killer.
7684
*/
77-
static bool has_intersects_mems_allowed(struct task_struct *start,
78-
const nodemask_t *mask)
85+
static bool oom_cpuset_eligible(struct task_struct *start,
86+
struct oom_control *oc)
7987
{
8088
struct task_struct *tsk;
8189
bool ret = false;
90+
const nodemask_t *mask = oc->nodemask;
91+
92+
if (is_memcg_oom(oc))
93+
return true;
8294

8395
rcu_read_lock();
8496
for_each_thread(start, tsk) {
@@ -105,8 +117,7 @@ static bool has_intersects_mems_allowed(struct task_struct *start,
105117
return ret;
106118
}
107119
#else
108-
static bool has_intersects_mems_allowed(struct task_struct *tsk,
109-
const nodemask_t *mask)
120+
static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
110121
{
111122
return true;
112123
}
@@ -146,24 +157,13 @@ static inline bool is_sysrq_oom(struct oom_control *oc)
146157
return oc->order == -1;
147158
}
148159

149-
static inline bool is_memcg_oom(struct oom_control *oc)
150-
{
151-
return oc->memcg != NULL;
152-
}
153-
154160
/* return true if the task is not adequate as candidate victim task. */
155-
static bool oom_unkillable_task(struct task_struct *p,
156-
const nodemask_t *nodemask)
161+
static bool oom_unkillable_task(struct task_struct *p)
157162
{
158163
if (is_global_init(p))
159164
return true;
160165
if (p->flags & PF_KTHREAD)
161166
return true;
162-
163-
/* p may not have freeable memory in nodemask */
164-
if (!has_intersects_mems_allowed(p, nodemask))
165-
return true;
166-
167167
return false;
168168
}
169169

@@ -190,19 +190,17 @@ static bool is_dump_unreclaim_slabs(void)
190190
* oom_badness - heuristic function to determine which candidate task to kill
191191
* @p: task struct of which task we should calculate
192192
* @totalpages: total present RAM allowed for page allocation
193-
* @nodemask: nodemask passed to page allocator for mempolicy ooms
194193
*
195194
* The heuristic for determining which task to kill is made to be as simple and
196195
* predictable as possible. The goal is to return the highest value for the
197196
* task consuming the most memory to avoid subsequent oom failures.
198197
*/
199-
unsigned long oom_badness(struct task_struct *p,
200-
const nodemask_t *nodemask, unsigned long totalpages)
198+
unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
201199
{
202200
long points;
203201
long adj;
204202

205-
if (oom_unkillable_task(p, nodemask))
203+
if (oom_unkillable_task(p))
206204
return 0;
207205

208206
p = find_lock_task_mm(p);
@@ -313,7 +311,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
313311
struct oom_control *oc = arg;
314312
unsigned long points;
315313

316-
if (oom_unkillable_task(task, oc->nodemask))
314+
if (oom_unkillable_task(task))
315+
goto next;
316+
317+
/* p may not have freeable memory in nodemask */
318+
if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
317319
goto next;
318320

319321
/*
@@ -337,7 +339,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
337339
goto select;
338340
}
339341

340-
points = oom_badness(task, oc->nodemask, oc->totalpages);
342+
points = oom_badness(task, oc->totalpages);
341343
if (!points || points < oc->chosen_points)
342344
goto next;
343345

@@ -382,7 +384,11 @@ static int dump_task(struct task_struct *p, void *arg)
382384
struct oom_control *oc = arg;
383385
struct task_struct *task;
384386

385-
if (oom_unkillable_task(p, oc->nodemask))
387+
if (oom_unkillable_task(p))
388+
return 0;
389+
390+
/* p may not have freeable memory in nodemask */
391+
if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
386392
return 0;
387393

388394
task = find_lock_task_mm(p);
@@ -1079,7 +1085,8 @@ bool out_of_memory(struct oom_control *oc)
10791085
check_panic_on_oom(oc);
10801086

10811087
if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
1082-
current->mm && !oom_unkillable_task(current, oc->nodemask) &&
1088+
current->mm && !oom_unkillable_task(current) &&
1089+
oom_cpuset_eligible(current, oc) &&
10831090
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
10841091
get_task_struct(current);
10851092
oc->chosen = current;

0 commit comments

Comments
 (0)