On Thu, 7 Oct 2010 16:28:11 +0900
Daisuke Nishimura <nishimura@xxxxxxxxxxxxxxxxx> wrote:

> On Thu, 7 Oct 2010 15:21:11 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
>
> > On Thu, 7 Oct 2010 11:17:43 +0900
> > KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
> >
> > > > hmm, if we'll do that, I think we need to do that under pte_lock in
> > > > mem_cgroup_move_charge_pte_range(). But we can't do wait_on_page_writeback()
> > > > under pte_lock, right? Or we need to re-organize the current move-charge
> > > > implementation.
> > > >
> > > Nice catch. I think releasing pte_lock() is okay (and it should be released).
> > >
> > > IIUC, the task's css_set points to the new cgroup when "move" is called. Then,
> > > it's not necessary to take pte_lock, I guess.
> > > (And taking pte_lock too long is not appreciated..)
> > >
> > > I'll write a sample patch today.
> > >
> > Here.
> Great!
>
> > ==
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> >
> > Now, at task migration among cgroups, the memory cgroup scans the page table
> > and moves the accounting if the flags are properly set.
> >
> > The core code, mem_cgroup_move_charge_pte_range(), does
> >
> > 	pte_offset_map_lock();
> > 	for all ptes in a page table:
> > 		1. look into the page table, find_and_get a page
> > 		2. remove it from LRU.
> > 		3. move charge.
> > 		4. putback to LRU. put_page()
> > 	pte_offset_map_unlock();
> >
> > for pte entries on a 3rd (2nd) level page table.
> >
> > This pte_offset_map_lock is held a bit too long. This patch modifies the
> > routine as
> >
> > 	pte_offset_map_lock()
> > 	for 32 pages:
> > 		find_and_get a page
> > 		record it
> > 	pte_offset_map_unlock()
> > 	for all recorded pages
> > 		isolate it from LRU.
> > 		move charge
> > 		putback to LRU
> > 	for all recorded pages
> > 		put_page()
> >
> > Note: pages newly charged while we move the accounting are charged to the
> > new group.
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> > ---
> >  mm/memcontrol.c |   92 ++++++++++++++++++++++++++++++++++++--------------------
> >  1 file changed, 60 insertions(+), 32 deletions(-)
> >
> > Index: mmotm-0928/mm/memcontrol.c
> > ===================================================================
> > --- mmotm-0928.orig/mm/memcontrol.c
> > +++ mmotm-0928/mm/memcontrol.c
> > @@ -4475,17 +4475,22 @@ one_by_one:
> >   *
> >   * Called with pte lock held.
> >   */
> > -union mc_target {
> > -	struct page *page;
> > -	swp_entry_t ent;
> > -};
> >
> >  enum mc_target_type {
> > -	MC_TARGET_NONE,	/* not used */
> > +	MC_TARGET_NONE,	/* used as failure code(0) */
> >  	MC_TARGET_PAGE,
> >  	MC_TARGET_SWAP,
> >  };
> >
> > +struct mc_target {
> > +	enum mc_target_type type;
> > +	union {
> > +		struct page *page;
> > +		swp_entry_t ent;
> > +	} val;
> > +};
> > +
> > +
> >  static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
> >  		unsigned long addr, pte_t ptent)
> >  {
> > @@ -4561,7 +4566,7 @@ static struct page *mc_handle_file_pte(s
> >  }
> >
> >  static int is_target_pte_for_mc(struct vm_area_struct *vma,
> > -		unsigned long addr, pte_t ptent, union mc_target *target)
> > +		unsigned long addr, pte_t ptent, struct mc_target *target)
> >  {
> >  	struct page *page = NULL;
> >  	struct page_cgroup *pc;
> > @@ -4587,7 +4592,7 @@ static int is_target_pte_for_mc(struct v
> >  		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
> >  			ret = MC_TARGET_PAGE;
> >  			if (target)
> > -				target->page = page;
> > +				target->val.page = page;
> >  		}
> >  		if (!ret || !target)
> >  			put_page(page);
> > @@ -4597,8 +4602,10 @@ static int is_target_pte_for_mc(struct v
> >  			css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
> >  		ret = MC_TARGET_SWAP;
> >  		if (target)
> > -			target->ent = ent;
> > +			target->val.ent = ent;
> >  	}
> > +	if (target)
> > +		target->type = ret;
> >  	return ret;
> >  }
> >
> > @@ -4751,6 +4758,9 @@ static void mem_cgroup_cancel_attach(str
> >  	mem_cgroup_clear_mc();
> >  }
> >
> > +
> > +#define MC_MOVE_ONCE	(32)
> > +
> >  static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
> >  				unsigned long addr, unsigned long end,
> >  				struct mm_walk *walk)
> > @@ -4759,26 +4769,47 @@ static int mem_cgroup_move_charge_pte_ra
> >  	struct vm_area_struct *vma = walk->private;
> >  	pte_t *pte;
> >  	spinlock_t *ptl;
> > +	struct mc_target *target;
> > +	int index, num;
> > +
> > +	target = kzalloc(sizeof(struct mc_target) * MC_MOVE_ONCE, GFP_KERNEL);
>
> hmm? I can't see it freed anywhere.
>
leaked..

> Considering you reset target[]->type to MC_TARGET_NONE, did you intend to
> reuse target[] while walking the page table?

yes.

> If so, how about adding a new member (struct mc_target *target) to
> move_charge_struct, and allocating/freeing it in mem_cgroup_move_charge()?

Hmm, sounds nice. I'll do.

Thanks,
-Kame

> Thanks,
> Daisuke Nishimura.
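For the record, roughly what I have in mind is below. This is a sketch only:
the struct and function names follow memcontrol.c and this patch, but the
elided members, the VMA walk, and the error handling are omitted, and the
snippet is untested.

	/*
	 * Sketch of the suggestion: keep the batch buffer in
	 * move_charge_struct so mem_cgroup_move_charge_pte_range() never
	 * has to allocate or free it per pmd.
	 */
	static struct move_charge_struct {
		/* ... existing members (mm, from, to, precharge, ...) ... */
		struct mc_target *target;	/* MC_MOVE_ONCE entries, reused per batch */
	} mc;

	static void mem_cgroup_move_charge(struct mm_struct *mm)
	{
		mc.target = kzalloc(sizeof(struct mc_target) * MC_MOVE_ONCE,
					GFP_KERNEL);
		if (!mc.target)
			return;

		/* ... walk all VMAs, calling mem_cgroup_move_charge_pte_range() ... */

		kfree(mc.target);	/* freed exactly once, not per pmd */
		mc.target = NULL;
	}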
> > +	if (!target)
> > +		return -ENOMEM;
> >
> >  retry:
> >  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> > -	for (; addr != end; addr += PAGE_SIZE) {
> > +	for (num = 0; num < MC_MOVE_ONCE && addr != end; addr += PAGE_SIZE) {
> >  		pte_t ptent = *(pte++);
> > -		union mc_target target;
> > -		int type;
> > +		ret = is_target_pte_for_mc(vma, addr, ptent, &target[num]);
> > +		if (!ret)
> > +			continue;
> > +		target[num++].type = ret;
> > +	}
> > +	pte_unmap_unlock(pte - 1, ptl);
> > +	cond_resched();
> > +
> > +	ret = 0;
> > +	index = 0;
> > +	do {
> > +		struct mc_target *mt;
> >  		struct page *page;
> >  		struct page_cgroup *pc;
> >  		swp_entry_t ent;
> >
> > -		if (!mc.precharge)
> > -			break;
> > +		if (!mc.precharge) {
> > +			ret = mem_cgroup_do_precharge(1);
> > +			if (ret)
> > +				goto out;
> > +			continue;
> > +		}
> > +
> > +		mt = &target[index++];
> >
> > -		type = is_target_pte_for_mc(vma, addr, ptent, &target);
> > -		switch (type) {
> > +		switch (mt->type) {
> >  		case MC_TARGET_PAGE:
> > -			page = target.page;
> > +			page = mt->val.page;
> >  			if (isolate_lru_page(page))
> > -				goto put;
> > +				break;
> >  			pc = lookup_page_cgroup(page);
> >  			if (!mem_cgroup_move_account(pc,
> >  						mc.from, mc.to, false)) {
> > @@ -4787,11 +4818,9 @@ retry:
> >  				mc.moved_charge++;
> >  			}
> >  			putback_lru_page(page);
> > -put:			/* is_target_pte_for_mc() gets the page */
> > -			put_page(page);
> >  			break;
> >  		case MC_TARGET_SWAP:
> > -			ent = target.ent;
> > +			ent = mt->val.ent;
> >  			if (!mem_cgroup_move_swap_account(ent,
> >  						mc.from, mc.to, false)) {
> >  				mc.precharge--;
> > @@ -4802,21 +4831,20 @@ put:		/* is_target_pte_for_mc() gets th
> >  		default:
> >  			break;
> >  		}
> > +	} while (index < num);
> > +out:
> > +	for (index = 0; index < num; index++) {
> > +		if (target[index].type == MC_TARGET_PAGE)
> > +			put_page(target[index].val.page);
> > +		target[index].type = MC_TARGET_NONE;
> >  	}
> > -	pte_unmap_unlock(pte - 1, ptl);
> > +
> > +	if (ret)
> > +		return ret;
> >  	cond_resched();
> >
> > -	if (addr != end) {
> > -		/*
> > -		 * We have consumed all precharges we got in can_attach().
> > -		 * We try charge one by one, but don't do any additional
> > -		 * charges to mc.to if we have failed in charge once in attach()
> > -		 * phase.
> > -		 */
> > -		ret = mem_cgroup_do_precharge(1);
> > -		if (!ret)
> > -			goto retry;
> > -	}
> > +	if (addr != end)
> > +		goto retry;
> >
> >  	return ret;
> >  }
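And for anyone skimming the diff: the shape of the new walk is just the
changelog pseudocode made concrete. In outline only --
record_targets()/move_targets()/put_targets() are made-up names, mc.target
is the preallocated buffer from the sketch above, and the patch open-codes
all three phases inside mem_cgroup_move_charge_pte_range():

	static int move_charge_outline(pmd_t *pmd, unsigned long addr,
				       unsigned long end,
				       struct vm_area_struct *vma)
	{
		struct mc_target *target = mc.target;	/* preallocated batch buffer */
		spinlock_t *ptl;
		pte_t *pte;
		int num;

		/* phase 1: under the pte lock, only find_and_get pages and
		 * record them (at most MC_MOVE_ONCE per batch) */
		pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
		num = record_targets(vma, pte, addr, end, target);
		pte_unmap_unlock(pte, ptl);

		/* phase 2: unlocked, so sleeping is fine: isolate from LRU,
		 * move charge, putback */
		move_targets(target, num);

		/* phase 3: drop the page references taken in phase 1 */
		put_targets(target, num);
		return 0;
	}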