I like the gist of this patch. A few comments below.

On Wed, Jul 08, 2015 at 02:27:51PM +0200, Michal Hocko wrote:
[...]
> diff --git a/fs/exec.c b/fs/exec.c
> index 1977c2a553ac..3ed9c0abc9f5 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -870,7 +870,7 @@ static int exec_mmap(struct mm_struct *mm)
>  		up_read(&old_mm->mmap_sem);
>  		BUG_ON(active_mm != old_mm);
>  		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
> -		mm_update_next_owner(old_mm);
> +		mm_inherit_memcg(mm, old_mm);
>  		mmput(old_mm);
>  		return 0;
>  	}
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 78e9d4ac57a1..8e6b2444ebfe 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -274,6 +274,52 @@ struct mem_cgroup {
>  extern struct cgroup_subsys_state *mem_cgroup_root_css;
>  
>  /**
> + * __mm_set_memcg - Set mm_struct:memcg to a given memcg.
> + * @mm: mm struct
> + * @memcg: mem_cgroup to be used
> + *
> + * Note that this function doesn't clean up the previous mm->memcg.
> + * This should be done by caller when necessary (e.g. when moving
> + * mm from one memcg to another).
> + */
> +static inline
> +void __mm_set_memcg(struct mm_struct *mm, struct mem_cgroup *memcg)
> +{
> +	if (memcg)
> +		css_get(&memcg->css);
> +	rcu_assign_pointer(mm->memcg, memcg);
> +}
> +
> +/**
> + * mm_inherit_memcg - Initialize mm_struct::memcg from an existing mm_struct
> + * @newmm: new mm struct
> + * @oldmm: old mm struct to inherit from
> + *
> + * Should be called for each new mm_struct.
> + */
> +static inline
> +void mm_inherit_memcg(struct mm_struct *newmm, struct mm_struct *oldmm)
> +{
> +	struct mem_cgroup *memcg = oldmm->memcg;

FWIW, if CONFIG_SPARSE_RCU_POINTER is on, this will trigger a
compile-time warning, as will any unannotated dereference of
mm_struct->memcg below.
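Something like the following is what I'd expect to keep sparse happy.
Just a sketch, assuming mm_struct::memcg gets the __rcu annotation (the
'1' condition is for illustration only):

	/* mm_types.h */
	struct mem_cgroup __rcu *memcg;

	/* memcontrol.h */
	static inline
	void mm_inherit_memcg(struct mm_struct *newmm, struct mm_struct *oldmm)
	{
		/*
		 * The condition only silences sparse; whether this unlocked
		 * read is really safe is a separate question, see the race
		 * scenario at the end of this mail.
		 */
		struct mem_cgroup *memcg =
			rcu_dereference_protected(oldmm->memcg, 1);

		__mm_set_memcg(newmm, memcg);
	}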
> +
> +	__mm_set_memcg(newmm, memcg);
> +}
> +
> +/**
> + * mm_drop_iter - drop mm_struct::memcg association

s/mm_drop_iter/mm_drop_memcg

> + * @mm: mm struct
> + *
> + * Should be called after the mm has been removed from all tasks
> + * and before it is freed (e.g. from mmput)
> + */
> +static inline void mm_drop_memcg(struct mm_struct *mm)
> +{
> +	if (mm->memcg)
> +		css_put(&mm->memcg->css);
> +	mm->memcg = NULL;
> +}
> +
> +/**
>   * mem_cgroup_events - count memory events against a cgroup
>   * @memcg: the memory cgroup
>   * @idx: the event index
> @@ -305,7 +351,6 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
>  bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
>  
>  struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
> -struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
>  
>  struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
>  static inline
> @@ -335,7 +380,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
>  	bool match = false;
>  
>  	rcu_read_lock();
> -	task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
> +	task_memcg = rcu_dereference(mm->memcg);
>  	if (task_memcg)
>  		match = mem_cgroup_is_descendant(task_memcg, memcg);
>  	rcu_read_unlock();
> @@ -474,7 +519,7 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
>  		return;
>  
>  	rcu_read_lock();
> -	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
> +	memcg = rcu_dereference(mm->memcg);
>  	if (unlikely(!memcg))
>  		goto out;
>  

If I'm not mistaken, mm->memcg equals NULL for any task in the root
memory cgroup (BTW, if it's true, it's worth mentioning in the comment
to mm->memcg definition IMO). As a result, we won't account the stats
for such tasks, will we?
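If so, maybe it is worth adding a helper that encapsulates the fallback
to root_mem_cgroup instead of open-coding the NULL check at every call
site? Just a sketch of what I mean (mm_memcg is a made-up name):

	/* Must be called under rcu_read_lock(). */
	static inline struct mem_cgroup *mm_memcg(struct mm_struct *mm)
	{
		struct mem_cgroup *memcg = rcu_dereference(mm->memcg);

		/* NULL means the mm never left the root cgroup */
		return memcg ? memcg : root_mem_cgroup;
	}

Then mem_cgroup_count_vm_event() and mm_match_cgroup() wouldn't have to
special-case NULL at all.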
[...]
> @@ -4749,37 +4748,49 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
>  	 * tunable will only affect upcoming migrations, not the current one.
>  	 * So we need to save it, and keep it going.
>  	 */
> -	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
> +	move_flags = READ_ONCE(to->move_charge_at_immigrate);
>  	if (!move_flags)
>  		return 0;
>  
>  	p = cgroup_taskset_first(tset);
> -	from = mem_cgroup_from_task(p);
> -
> -	VM_BUG_ON(from == memcg);
> +	if (!thread_group_leader(p))
> +		return 0;
>  
>  	mm = get_task_mm(p);
>  	if (!mm)
>  		return 0;
> -	/* We move charges only when we move a owner of the mm */
> -	if (mm->owner == p) {
> -		VM_BUG_ON(mc.from);
> -		VM_BUG_ON(mc.to);
> -		VM_BUG_ON(mc.precharge);
> -		VM_BUG_ON(mc.moved_charge);
> -		VM_BUG_ON(mc.moved_swap);
> -
> -		spin_lock(&mc.lock);
> -		mc.from = from;
> -		mc.to = memcg;
> -		mc.flags = move_flags;
> -		spin_unlock(&mc.lock);
> -		/* We set mc.moving_task later */
> -
> -		ret = mem_cgroup_precharge_mc(mm);
> -		if (ret)
> -			mem_cgroup_clear_mc();
> -	}
> +
> +	/*
> +	 * tasks' cgroup might be different from the one p->mm is associated
> +	 * with because CLONE_VM is allowed without CLONE_THREAD. The task is
> +	 * moving so we have to migrate from the memcg associated with its
> +	 * address space.
> +	 * No need to take a reference here because the memcg is pinned by the
> +	 * mm_struct.
> +	 */

But after we drop the reference to the mm below, mc.from can pass away
and we can get a use-after-free in mem_cgroup_move_task, can't we?

AFAIU the real reason why we can skip taking a reference to mc.from, as
well as to mc.to, is that task migration proceeds under cgroup_mutex,
which blocks cgroup destruction. Am I missing something? If not, please
remove this comment, because it's confusing.

> +	from = READ_ONCE(mm->memcg);
> +	if (!from)
> +		from = root_mem_cgroup;
> +	if (from == to)
> +		goto out;
> +
> +	VM_BUG_ON(mc.from);
> +	VM_BUG_ON(mc.to);
> +	VM_BUG_ON(mc.precharge);
> +	VM_BUG_ON(mc.moved_charge);
> +	VM_BUG_ON(mc.moved_swap);
> +
> +	spin_lock(&mc.lock);
> +	mc.from = from;
> +	mc.to = to;
> +	mc.flags = move_flags;
> +	spin_unlock(&mc.lock);
> +	/* We set mc.moving_task later */
> +
> +	ret = mem_cgroup_precharge_mc(mm);
> +	if (ret)
> +		mem_cgroup_clear_mc();
> +out:
>  	mmput(mm);
>  	return ret;
>  }
> @@ -4932,14 +4943,26 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
>  {
>  	struct task_struct *p = cgroup_taskset_first(tset);
>  	struct mm_struct *mm = get_task_mm(p);
> +	struct mem_cgroup *old_memcg = NULL;
>  
>  	if (mm) {
> +		old_memcg = READ_ONCE(mm->memcg);
> +		__mm_set_memcg(mm, mem_cgroup_from_css(css));
> +
>  		if (mc.to)
>  			mem_cgroup_move_charge(mm);
>  		mmput(mm);
>  	}
>  	if (mc.to)
>  		mem_cgroup_clear_mc();
> +
> +	/*
> +	 * Be careful and drop the reference only after we are done because
> +	 * p's task_css memcg might be different from p->memcg and nothing else
> +	 * might be pinning the old memcg.
> +	 */
> +	if (old_memcg)
> +		css_put(&old_memcg->css);

Please explain why the following race is impossible:

CPU0					CPU1
----					----
[current = T]
dup_mm or exec_mmap
  mm_inherit_memcg
    memcg = current->mm->memcg;
					mem_cgroup_move_task
					  p = T;
					  mm = get_task_mm(p);
					  old_memcg = mm->memcg;
					  css_put(&old_memcg->css);
					  /* old_memcg can be freed now */
    css_get(memcg); /* BUG */

>  }
>  #else /* !CONFIG_MMU */
>  static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,

Thanks,
Vladimir