On Tue, 2 Mar 2010 13:55:24 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
> Very sorry, mutex_lock is called after prepare_to_wait.
> This is a fixed one.

I'm willing to test your patch, but I have one concern.

> +/*
> + * try to call OOM killer. returns false if we should exit memory-reclaim loop.
> + */
> +bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
>  {
> -        mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
> +        DEFINE_WAIT(wait);
> +        bool locked;
> +
> +        /* At first, try to OOM lock hierarchy under mem.*/
> +        mutex_lock(&memcg_oom_mutex);
> +        locked = mem_cgroup_oom_lock(mem);
> +        if (!locked)
> +                prepare_to_wait(&memcg_oom_waitq, &wait, TASK_INTERRUPTIBLE);
> +        mutex_unlock(&memcg_oom_mutex);
> +
> +        if (locked)
> +                mem_cgroup_out_of_memory(mem, mask);
> +        else {
> +                schedule();
> +                finish_wait(&memcg_oom_waitq, &wait);
> +        }
> +        mutex_lock(&memcg_oom_mutex);
> +        mem_cgroup_oom_unlock(mem);
> +        /* TODO: more fine grained waitq ? */
> +        wake_up_all(&memcg_oom_waitq);
> +        mutex_unlock(&memcg_oom_mutex);
> +
> +        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
> +                return false;
> +        /* Give chance to dying process */
> +        schedule_timeout(1);
> +        return true;
>  }

Isn't there a race condition like the following?

    context A                                   context B
    mutex_lock(&memcg_oom_mutex)
    mem_cgroup_oom_lock() ->success
    mutex_unlock(&memcg_oom_mutex)
    mem_cgroup_out_of_memory()
                                                mutex_lock(&memcg_oom_mutex)
                                                mem_cgroup_oom_lock() ->fail
                                                prepare_to_wait()
                                                mutex_unlock(&memcg_oom_mutex)
    mutex_lock(&memcg_oom_mutex)
    mem_cgroup_oom_unlock()
    wake_up_all()
    mutex_unlock(&memcg_oom_mutex)
                                                schedule()
                                                finish_wait()

In this case, context B will not be woken up, right?
(A small userspace sketch of the sleep/wakeup ordering in question is appended after the patch.)

Thanks,
Daisuke Nishimura.

> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
>
> In current page-fault code,
>
>         handle_mm_fault()
>                 -> ...
>                 -> mem_cgroup_charge()
>                 -> map page or handle error.
>                 -> check return code.
>
> If the page fault's return code is VM_FAULT_OOM, page_fault_out_of_memory()
> is called. But if it's caused by memcg, OOM should already have been invoked.
> Then, I added a patch: a636b327f731143ccc544b966cfd8de6cb6d72c6
>
> That patch records last_oom_jiffies for memcg's sub-hierarchy and
> prevents page_fault_out_of_memory from being invoked in the near future.
>
> But Nishimura-san reported that the check by jiffies is not enough
> when the system is terribly heavy.
>
> This patch changes memcg's oom logic as follows:
>         * If memcg causes an OOM-kill, continue to retry.
>         * Remove the jiffies check which is used now.
>         * Add a memcg-oom-lock which works like the per-zone oom lock.
>         * If current is killed (as a process), bypass the charge.
>
> Something more sophisticated can be added, but this patch does the
> fundamental things.
> TODO:
>  - add oom notifier
>  - add per-memcg disable-oom-kill flag and freezer at oom.
>  - more chances to wake up the oom waiter (when changing memory limit etc.)
>
> Changelog:
>  - fixed mutex and prepare_to_wait order.
>  - fixed per-memcg oom lock.
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> ---
>  include/linux/memcontrol.h |    6 --
>  mm/memcontrol.c            |  109 +++++++++++++++++++++++++++++++++------------
>  mm/oom_kill.c              |    8 ---
>  3 files changed, 82 insertions(+), 41 deletions(-)
>
> Index: mmotm-2.6.33-Feb11/include/linux/memcontrol.h
> ===================================================================
> --- mmotm-2.6.33-Feb11.orig/include/linux/memcontrol.h
> +++ mmotm-2.6.33-Feb11/include/linux/memcontrol.h
> @@ -124,7 +124,6 @@ static inline bool mem_cgroup_disabled(v
>          return false;
>  }
>
> -extern bool mem_cgroup_oom_called(struct task_struct *task);
>  void mem_cgroup_update_file_mapped(struct page *page, int val);
>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>                                                  gfp_t gfp_mask, int nid,
> @@ -258,11 +257,6 @@ static inline bool mem_cgroup_disabled(v
>          return true;
>  }
>
> -static inline bool mem_cgroup_oom_called(struct task_struct *task)
> -{
> -        return false;
> -}
> -
>  static inline int
>  mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
>  {
> Index: mmotm-2.6.33-Feb11/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.33-Feb11.orig/mm/memcontrol.c
> +++ mmotm-2.6.33-Feb11/mm/memcontrol.c
> @@ -200,7 +200,7 @@ struct mem_cgroup {
>           * Should the accounting and control be hierarchical, per subtree?
>           */
>          bool use_hierarchy;
> -        unsigned long last_oom_jiffies;
> +        atomic_t oom_lock;
>          atomic_t refcnt;
>
>          unsigned int swappiness;
> @@ -1234,32 +1234,77 @@ static int mem_cgroup_hierarchical_recla
>          return total;
>  }
>
> -bool mem_cgroup_oom_called(struct task_struct *task)
> +static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
>  {
> -        bool ret = false;
> -        struct mem_cgroup *mem;
> -        struct mm_struct *mm;
> +        int *val = (int *)data;
> +        int x;
>
> -        rcu_read_lock();
> -        mm = task->mm;
> -        if (!mm)
> -                mm = &init_mm;
> -        mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
> -        if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
> -                ret = true;
> -        rcu_read_unlock();
> -        return ret;
> +        x = atomic_inc_return(&mem->oom_lock);
> +        if (x > *val)
> +                *val = x;
> +        return 0;
>  }
> +/*
> + * Check OOM-Killer is already running under our hierarchy.
> + * If someone is running, return false.
> + */
> +static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
> +{
> +        int check = 0;
> +
> +        mem_cgroup_walk_tree(mem, &check, mem_cgroup_oom_lock_cb);
>
> -static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
> +        if (check == 1)
> +                return true;
> +        return false;
> +}
> +
> +static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
>  {
> -        mem->last_oom_jiffies = jiffies;
> +        atomic_dec(&mem->oom_lock);
>          return 0;
>  }
>
> -static void record_last_oom(struct mem_cgroup *mem)
> +static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
> +{
> +        mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
> +}
> +
> +static DEFINE_MUTEX(memcg_oom_mutex);
> +static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
> +
> +/*
> + * try to call OOM killer. returns false if we should exit memory-reclaim loop.
> + */
> +bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
>  {
> -        mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
> +        DEFINE_WAIT(wait);
> +        bool locked;
> +
> +        /* At first, try to OOM lock hierarchy under mem.*/
> +        mutex_lock(&memcg_oom_mutex);
> +        locked = mem_cgroup_oom_lock(mem);
> +        if (!locked)
> +                prepare_to_wait(&memcg_oom_waitq, &wait, TASK_INTERRUPTIBLE);
> +        mutex_unlock(&memcg_oom_mutex);
> +
> +        if (locked)
> +                mem_cgroup_out_of_memory(mem, mask);
> +        else {
> +                schedule();
> +                finish_wait(&memcg_oom_waitq, &wait);
> +        }
> +        mutex_lock(&memcg_oom_mutex);
> +        mem_cgroup_oom_unlock(mem);
> +        /* TODO: more fine grained waitq ? */
> +        wake_up_all(&memcg_oom_waitq);
> +        mutex_unlock(&memcg_oom_mutex);
> +
> +        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
> +                return false;
> +        /* Give chance to dying process */
> +        schedule_timeout(1);
> +        return true;
>  }
>
>  /*
> @@ -1432,11 +1477,14 @@ static int __mem_cgroup_try_charge(struc
>          struct res_counter *fail_res;
>          int csize = CHARGE_SIZE;
>
> -        if (unlikely(test_thread_flag(TIF_MEMDIE))) {
> -                /* Don't account this! */
> -                *memcg = NULL;
> -                return 0;
> -        }
> +        /*
> +         * Unlike gloval-vm's OOM-kill, we're not in memory shortage
> +         * in system level. So, allow to go ahead dying process in addition to
> +         * MEMDIE process.
> +         */
> +        if (unlikely(test_thread_flag(TIF_MEMDIE)
> +                     || fatal_signal_pending(current)))
> +                goto bypass;
>
>          /*
>           * We always charge the cgroup the mm_struct belongs to.
> @@ -1549,11 +1597,15 @@ static int __mem_cgroup_try_charge(struc
>          }
>
>          if (!nr_retries--) {
> -                if (oom) {
> -                        mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
> -                        record_last_oom(mem_over_limit);
> +                if (!oom)
> +                        goto nomem;
> +                if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
> +                        nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> +                        continue;
>                  }
> -                goto nomem;
> +                /* When we reach here, current task is dying .*/
> +                css_put(&mem->css);
> +                goto bypass;
>          }
>  }
>  if (csize > PAGE_SIZE)
> @@ -1572,6 +1624,9 @@ done:
>  nomem:
>          css_put(&mem->css);
>          return -ENOMEM;
> +bypass:
> +        *memcg = NULL;
> +        return 0;
>  }
>
>  /*
> Index: mmotm-2.6.33-Feb11/mm/oom_kill.c
> ===================================================================
> --- mmotm-2.6.33-Feb11.orig/mm/oom_kill.c
> +++ mmotm-2.6.33-Feb11/mm/oom_kill.c
> @@ -599,13 +599,6 @@ void pagefault_out_of_memory(void)
>                  /* Got some memory back in the last second. */
>                  return;
>
> -        /*
> -         * If this is from memcg, oom-killer is already invoked.
> -         * and not worth to go system-wide-oom.
> -         */
> -        if (mem_cgroup_oom_called(current))
> -                goto rest_and_return;
> -
>          if (sysctl_panic_on_oom)
>                  panic("out of memory from page fault. panic_on_oom is selected.\n");
>
> @@ -617,7 +610,6 @@ void pagefault_out_of_memory(void)
>           * Give "p" a good chance of killing itself before we
>           * retry to allocate memory.
>           */
> -rest_and_return:
>          if (!test_thread_flag(TIF_MEMDIE))
>                  schedule_timeout_uninterruptible(1);
>  }
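
For readers less familiar with the missed-wakeup problem being discussed above,
the sketch below is a minimal userspace analogue, not the kernel code: it swaps
the waitqueue and memcg_oom_mutex for a pthread mutex and condition variable,
and every name in it (oom_in_progress, oom_cond, waiter, oom_handler) is made
up for illustration. The point it shows is that the waiter re-checks the "OOM
in progress" predicate under the same lock the waker takes before signalling,
and pthread_cond_wait() drops that lock and sleeps atomically, so a wakeup
cannot slip into a window between "decided to sleep" and "actually asleep".

/*
 * Userspace sketch of the sleep/wakeup ordering discussed above.
 * Illustration only; names are invented and this is not the kernel patch.
 *
 * Build: cc -pthread lost_wakeup_sketch.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t oom_mutex = PTHREAD_MUTEX_INITIALIZER; /* plays memcg_oom_mutex */
static pthread_cond_t  oom_cond  = PTHREAD_COND_INITIALIZER;  /* plays memcg_oom_waitq */
static bool oom_in_progress;                    /* plays the hierarchy OOM lock */

/* Context B: failed to take the OOM lock, wait until the holder is done. */
static void *waiter(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&oom_mutex);
        /*
         * The predicate is checked under oom_mutex, and pthread_cond_wait()
         * releases the mutex and sleeps atomically, so the broadcast below
         * cannot be issued after we decided to sleep but before we are
         * registered as a sleeper.
         */
        while (oom_in_progress)
                pthread_cond_wait(&oom_cond, &oom_mutex);
        pthread_mutex_unlock(&oom_mutex);
        puts("waiter: OOM handling finished, retrying charge");
        return NULL;
}

/* Context A: holds the OOM lock, runs the OOM killer, then wakes everyone. */
static void *oom_handler(void *arg)
{
        (void)arg;
        /* ... mem_cgroup_out_of_memory() would run here ... */
        pthread_mutex_lock(&oom_mutex);
        oom_in_progress = false;                /* mem_cgroup_oom_unlock() analogue */
        pthread_cond_broadcast(&oom_cond);      /* wake_up_all() analogue */
        pthread_mutex_unlock(&oom_mutex);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        oom_in_progress = true;         /* context A already won the OOM lock */
        pthread_create(&b, NULL, waiter, NULL);
        pthread_create(&a, NULL, oom_handler, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}

Whichever order the two threads are scheduled in, the waiter either sees the
predicate already cleared and skips the wait, or is reliably woken by the
broadcast. In kernel waitqueue terms, the comparable safety is usually obtained
by re-checking the condition after prepare_to_wait(), since a wake_up() issued
once the task is on the waitqueue simply makes the subsequent schedule() return.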