The page allocator will only allocate pages on the nodes indicated by `nodemask`. But the OOM killer still selects the bad process by total rss usage, so killing it may reclaim nothing on the nodes indicated by `nodemask`. This patch lets oom calculate rss only on the given node when oc->constraint equals CONSTRAINT_MEMORY_POLICY. If `nodemask` is assigned, the process with the highest memory consumption on the specific node will be killed. The oom_kill dmesg output will look like this: ``` [ 1471.436027] Tasks state (memory values in pages): [ 1471.438518] [ pid ] uid tgid total_vm rss (01)nrss pgtables_bytes swapents oom_score_adj name [ 1471.554703] [ 1011] 0 1011 220005 8589 1872 823296 0 0 node [ 1471.707912] [ 12399] 0 12399 1311306 1311056 262170 10534912 0 0 a.out [ 1471.712429] [ 13135] 0 13135 787018 674666 674300 5439488 0 0 a.out [ 1471.721506] [ 13295] 0 13295 597 188 0 24576 0 0 sh [ 1471.734600] oom-kill:constraint=CONSTRAINT_MEMORY_POLICY,nodemask=1,cpuset=/,mems_allowed=0-2,global_oom,task_memcg=/user.slice/user-0.slice/session-3.scope,task=a.out,pid=13135,uid=0 [ 1471.742583] Out of memory: Killed process 13135 (a.out) total-vm:3148072kB, anon-rss:2697304kB, file-rss:1360kB, shmem-rss:0kB, UID:0 pgtables:5312kB oom_score_adj:0 [ 1471.849615] oom_reaper: reaped process 13135 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB ``` Signed-off-by: Gang Li <ligang.bdlg@xxxxxxxxxxxxx> --- fs/proc/base.c | 6 +++++- include/linux/oom.h | 2 +- mm/oom_kill.c | 45 +++++++++++++++++++++++++++++++++++++-------- 3 files changed, 43 insertions(+), 10 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index c1031843cc6a..caf0f51284d0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -552,8 +552,12 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; long badness; + struct oom_control oc = { + .totalpages = totalpages, + .gfp_mask = 0, + }; - badness = oom_badness(task, totalpages); + 
badness = oom_badness(task, &oc); /* * Special case OOM_SCORE_ADJ_MIN for all others scale the * badness value into [0, 2000] range which we have been diff --git a/include/linux/oom.h b/include/linux/oom.h index 2db9a1432511..0cb6a60be776 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -109,7 +109,7 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) bool __oom_reap_task_mm(struct mm_struct *mm); long oom_badness(struct task_struct *p, - unsigned long totalpages); + struct oom_control *oc); extern bool out_of_memory(struct oom_control *oc); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 757f5665ae94..75a80b5a63bf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -198,7 +198,7 @@ static bool should_dump_unreclaim_slab(void) * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. */ -long oom_badness(struct task_struct *p, unsigned long totalpages) +long oom_badness(struct task_struct *p, struct oom_control *oc) { long points; long adj; @@ -227,12 +227,22 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. 
*/ - points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) + - mm_pgtables_bytes(p->mm) / PAGE_SIZE; + if (unlikely(oc->constraint == CONSTRAINT_MEMORY_POLICY)) { + struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask), + oc->nodemask); + int nid_to_find_victim = zone_to_nid(zoneref->zone); + + points = get_mm_counter(p->mm, -1, nid_to_find_victim) + + get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) + + mm_pgtables_bytes(p->mm) / PAGE_SIZE; + } else { + points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) + + mm_pgtables_bytes(p->mm) / PAGE_SIZE; + } task_unlock(p); /* Normalize to oom_score_adj units */ - adj *= totalpages / 1000; + adj *= oc->totalpages / 1000; points += adj; return points; @@ -338,7 +348,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) goto select; } - points = oom_badness(task, oc->totalpages); + points = oom_badness(task, oc); if (points == LONG_MIN || points < oc->chosen_points) goto next; @@ -382,6 +392,7 @@ static int dump_task(struct task_struct *p, void *arg) { struct oom_control *oc = arg; struct task_struct *task; + unsigned long node_mm_rss; if (oom_unkillable_task(p)) return 0; @@ -399,9 +410,18 @@ static int dump_task(struct task_struct *p, void *arg) return 0; } - pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", + if (unlikely(oc->constraint == CONSTRAINT_MEMORY_POLICY)) { + struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask), + oc->nodemask); + int nid_to_find_victim = zone_to_nid(zoneref->zone); + + node_mm_rss = get_mm_counter(p->mm, -1, nid_to_find_victim); + } else { + node_mm_rss = 0; + } + pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), - task->tgid, task->mm->total_vm, get_mm_rss(task->mm), + task->tgid, task->mm->total_vm, get_mm_rss(task->mm), node_mm_rss, mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS, 
NUMA_NO_NODE), task->signal->oom_score_adj, task->comm); @@ -422,8 +442,17 @@ static int dump_task(struct task_struct *p, void *arg) */ static void dump_tasks(struct oom_control *oc) { + int nid_to_find_victim; + + if (oc->nodemask) { + struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask), + oc->nodemask); + nid_to_find_victim = zone_to_nid(zoneref->zone); + } else { + nid_to_find_victim = -1; + } pr_info("Tasks state (memory values in pages):\n"); - pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss (%02d)nrss pgtables_bytes swapents oom_score_adj name\n", nid_to_find_victim); if (is_memcg_oom(oc)) mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); -- 2.20.1