On Tue 12-07-16 22:29:18, Tetsuo Handa wrote:
> Currently, we walk the process list in order to find existing
> TIF_MEMDIE threads. But if we remember the list of mm_structs used by
> TIF_MEMDIE threads, we can avoid walking the process list. A later
> patch in the series will change the OOM reaper to use the list of
> mm_structs introduced by this patch.
> 
> Also, we can start using TIF_MEMDIE only for granting access to memory
> reserves to OOM victims which actually need to allocate, and decouple
> its current double meaning. A later patch in the series will eliminate
> the OOM_SCAN_ABORT case and "struct signal_struct"->oom_victims,
> because oom_has_pending_mm() introduced by this patch can take over
> that role.
> 
> It is theoretically possible that the number of elements on this list
> grows to as many as the number of configured memcg groups and
> mempolicy/cpuset patterns. But in most cases, the OOM reaper will
> immediately remove almost all elements in the first OOM reap attempt.
> Moreover, many OOM events can be resolved before the OOM reaper even
> tries to reap that memory. Therefore, the average speed of removing
> elements will be faster than the average speed of adding elements.
> If the delay caused by retrying a specific element for one second
> before trying other elements matters, we can do parallel OOM reaping.
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx>

OK, this looks better. The amount of added code seems quite large, but
follow-up patches will remove quite some code, so it looks worthwhile.
I am still not 100% decided whether this approach is better than the
signal_struct->oom_mm one. Both have some pros and cons, but this
approach is definitely sane and reasonable, so

Acked-by: Michal Hocko <mhocko@xxxxxxxx>
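
To make the intended life cycle concrete before the diff below, here is
a short, untested sketch. The wrapper function is invented purely for
illustration; mark_oom_victim(), oom_has_pending_mm(), exit_oom_mm()
and the mm->oom_mm fields are the ones the patch adds:

/* Untested illustration, not part of the patch. */
static void oom_mm_lifecycle_sketch(struct task_struct *victim)
{
        /*
         * 1. The OOM killer marks the victim: mark_oom_victim() pins
         *    victim->mm (mm_count) and the task itself, records the
         *    task in mm->oom_mm.victim and links the mm onto the
         *    global oom_mm_list.
         */
        mark_oom_victim(victim);

        /*
         * 2. Until that mm is freed, later OOM invocations in the same
         *    oom domain see a pending mm and bail out instead of
         *    picking a second victim (NULL/NULL stands for the global
         *    domain here).
         */
        if (oom_has_pending_mm(NULL, NULL))
                return;

        /*
         * 3. When the last mm user goes away, __mmput() sees that
         *    mm->oom_mm.victim is set and calls exit_oom_mm(), which
         *    unlinks the mm and drops both references taken in step 1.
         */
}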
> ---
>  include/linux/mm_types.h |  7 +++++
>  include/linux/oom.h      |  3 +++
>  kernel/fork.c            |  2 ++
>  mm/memcontrol.c          |  5 ++++
>  mm/oom_kill.c            | 69 +++++++++++++++++++++++++++++++++++++++++-------
>  5 files changed, 77 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 79472b2..96a8709 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -392,6 +392,12 @@ struct mm_rss_stat {
>          atomic_long_t count[NR_MM_COUNTERS];
>  };
> 
> +struct oom_mm {
> +        struct list_head list; /* Linked to oom_mm_list list. */
> +        /* Thread which was passed to mark_oom_victim() for the last time. */
> +        struct task_struct *victim;
> +};
> +
>  struct kioctx_table;
>  struct mm_struct {
>          struct vm_area_struct *mmap;            /* list of VMAs */
> @@ -515,6 +521,7 @@ struct mm_struct {
>  #ifdef CONFIG_HUGETLB_PAGE
>          atomic_long_t hugetlb_usage;
>  #endif
> +        struct oom_mm oom_mm;
>  #ifdef CONFIG_MMU
>          struct work_struct async_put_work;
>  #endif
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 5bc0457..bdcb331 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -91,6 +91,9 @@ extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
>  extern void check_panic_on_oom(struct oom_control *oc,
>                                 enum oom_constraint constraint);
> 
> +extern void exit_oom_mm(struct mm_struct *mm);
> +extern bool oom_has_pending_mm(struct mem_cgroup *memcg,
> +                               const nodemask_t *nodemask);
>  extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
>                                                 struct task_struct *task);
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 7926993..d83163a 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -722,6 +722,8 @@ static inline void __mmput(struct mm_struct *mm)
>          }
>          if (mm->binfmt)
>                  module_put(mm->binfmt->module);
> +        if (mm->oom_mm.victim)
> +                exit_oom_mm(mm);
>          mmdrop(mm);
>  }
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 40dfca3..8f7a5b7 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1241,6 +1241,11 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  	}
> 
>          check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
> +        if (oom_has_pending_mm(memcg, NULL)) {
> +                /* Set a dummy value to return "true". */
> +                chosen = (void *) 1;
> +                goto unlock;
> +        }
>          totalpages = mem_cgroup_get_limit(memcg) ? : 1;
>          for_each_mem_cgroup_tree(iter, memcg) {
>                  struct css_task_iter it;
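
A side note on the hunk above: the (void *) 1 works because the
function's return value is derived from "chosen", as the patch's own
comment indicates. Spelled out, the bail-out is equivalent to the
following untested sketch, where "handled" is a hypothetical local
variable standing in for the dummy pointer:

        if (oom_has_pending_mm(memcg, NULL)) {
                /*
                 * A victim in this memcg is still releasing its
                 * memory; report the OOM as handled rather than
                 * killing another task.
                 */
                handled = true;
                goto unlock;
        }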
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 9f0022e..07e8c1a 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -132,6 +132,20 @@ static inline bool is_sysrq_oom(struct oom_control *oc)
>          return oc->order == -1;
>  }
> 
> +static bool task_in_oom_domain(struct task_struct *p, struct mem_cgroup *memcg,
> +                               const nodemask_t *nodemask)
> +{
> +        /* When mem_cgroup_out_of_memory() and p is not member of the group */
> +        if (memcg && !task_in_mem_cgroup(p, memcg))
> +                return false;
> +
> +        /* p may not have freeable memory in nodemask */
> +        if (!has_intersects_mems_allowed(p, nodemask))
> +                return false;
> +
> +        return true;
> +}
> +
>  /* return true if the task is not adequate as candidate victim task. */
>  static bool oom_unkillable_task(struct task_struct *p,
>                  struct mem_cgroup *memcg, const nodemask_t *nodemask)
> @@ -141,15 +155,7 @@ static bool oom_unkillable_task(struct task_struct *p,
>          if (p->flags & PF_KTHREAD)
>                  return true;
> 
> -        /* When mem_cgroup_out_of_memory() and p is not member of the group */
> -        if (memcg && !task_in_mem_cgroup(p, memcg))
> -                return true;
> -
> -        /* p may not have freeable memory in nodemask */
> -        if (!has_intersects_mems_allowed(p, nodemask))
> -                return true;
> -
> -        return false;
> +        return !task_in_oom_domain(p, memcg, nodemask);
>  }
> 
>  /**
> @@ -275,6 +281,34 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
>  }
>  #endif
> 
> +static LIST_HEAD(oom_mm_list);
> +static DEFINE_SPINLOCK(oom_mm_lock);
> +
> +void exit_oom_mm(struct mm_struct *mm)
> +{
> +        spin_lock(&oom_mm_lock);
> +        list_del(&mm->oom_mm.list);
> +        spin_unlock(&oom_mm_lock);
> +        put_task_struct(mm->oom_mm.victim);
> +        mmdrop(mm);
> +}
> +
> +bool oom_has_pending_mm(struct mem_cgroup *memcg, const nodemask_t *nodemask)
> +{
> +        struct mm_struct *mm;
> +        bool ret = false;
> +
> +        spin_lock(&oom_mm_lock);
> +        list_for_each_entry(mm, &oom_mm_list, oom_mm.list) {
> +                if (task_in_oom_domain(mm->oom_mm.victim, memcg, nodemask)) {
> +                        ret = true;
> +                        break;
> +                }
> +        }
> +        spin_unlock(&oom_mm_lock);
> +        return ret;
> +}
> +
>  enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
>                                          struct task_struct *task)
>  {
> @@ -653,6 +687,8 @@ subsys_initcall(oom_init)
>   */
>  void mark_oom_victim(struct task_struct *tsk)
>  {
> +        struct mm_struct *mm = tsk->mm;
> +
>          WARN_ON(oom_killer_disabled);
>          /* OOM killer might race with memcg OOM */
>          if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
> @@ -666,6 +702,18 @@ void mark_oom_victim(struct task_struct *tsk)
>           */
>          __thaw_task(tsk);
>          atomic_inc(&oom_victims);
> +        /*
> +         * Since mark_oom_victim() is called from multiple threads,
> +         * connect this mm to oom_mm_list only if not yet connected.
> +         */
> +        if (!mm->oom_mm.victim) {
> +                atomic_inc(&mm->mm_count);
> +                get_task_struct(tsk);
> +                mm->oom_mm.victim = tsk;
> +                spin_lock(&oom_mm_lock);
> +                list_add_tail(&mm->oom_mm.list, &oom_mm_list);
> +                spin_unlock(&oom_mm_lock);
> +        }
>  }
> 
>  /**
> @@ -1026,6 +1074,9 @@ bool out_of_memory(struct oom_control *oc)
>                  return true;
>          }
> 
> +        if (!is_sysrq_oom(oc) && oom_has_pending_mm(oc->memcg, oc->nodemask))
> +                return true;
> +
>          p = select_bad_process(oc, &points, totalpages);
>          /* Found nothing?!?! Either we hang forever, or we panic. */
>          if (!p && !is_sysrq_oom(oc)) {
> -- 
> 1.8.3.1
> 
-- 
Michal Hocko
SUSE Labs
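
As a standalone illustration of the bookkeeping above, the add-once /
remove-once discipline and the reference pairing of mark_oom_victim()
and exit_oom_mm() can be modelled in a few lines of plain userspace C.
Everything below is a mock (all mock_* names are invented); it is a
sketch of the idea, not kernel code:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct mock_mm {
        struct mock_mm *next;   /* models mm->oom_mm.list */
        int victim;             /* models mm->oom_mm.victim, 0 == unset */
        int refcount;           /* models mm->mm_count */
};

static struct mock_mm *pending_head;    /* models oom_mm_list */

static void mock_mark_oom_victim(struct mock_mm *mm, int victim_pid)
{
        if (mm->victim)         /* already on the list: nothing to do */
                return;
        mm->refcount++;         /* models atomic_inc(&mm->mm_count) */
        mm->victim = victim_pid; /* models get_task_struct() + store */
        mm->next = pending_head; /* models list_add_tail() under the lock */
        pending_head = mm;
}

static void mock_exit_oom_mm(struct mock_mm *mm)
{
        struct mock_mm **pp;

        for (pp = &pending_head; *pp; pp = &(*pp)->next) {
                if (*pp == mm) {        /* models list_del() under the lock */
                        *pp = mm->next;
                        break;
                }
        }
        mm->victim = 0;         /* models put_task_struct() */
        mm->refcount--;         /* models the mmdrop() in exit_oom_mm() */
}

static bool mock_oom_has_pending_mm(void)
{
        return pending_head != NULL;    /* domain filtering omitted */
}

int main(void)
{
        struct mock_mm mm = { .refcount = 1 };

        mock_mark_oom_victim(&mm, 42);
        mock_mark_oom_victim(&mm, 43);          /* second mark is a no-op */
        assert(mock_oom_has_pending_mm());      /* the OOM killer backs off */
        mock_exit_oom_mm(&mm);                  /* models the __mmput() path */
        assert(!mock_oom_has_pending_mm());
        assert(mm.refcount == 1);               /* references are balanced */
        printf("lifecycle model OK\n");
        return 0;
}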