KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> writes: > On Tue, 24 Aug 2010 00:47:50 -0700 > Greg Thelen <gthelen@xxxxxxxxxx> wrote: > >> KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> writes: >> >> > I have an idea to remove page_cgroup->page pointer, 8bytes reduction per page. >> > But it will be after this work. >> > == >> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> >> > >> > Now, addresses of memory cgroup can be calculated by their ID without complex. >> > This patch relplaces pc->mem_cgroup from a pointer to a unsigned short. >> > On 64bit architecture, this offers us more 6bytes room per page_cgroup. >> > Use 2bytes for blkio-cgroup's page tracking. More 4bytes will be used for >> > some light-weight concurrent access. >> > >> > We may able to move this id onto flags field but ...go step by step. >> > >> > Changelog: 20100811 >> > - using new rcu APIs, as rcu_dereference_check() etc. >> > Changelog: 20100804 >> > - added comments to page_cgroup.h >> > Changelog: 20100730 >> > - fixed some garbage added by debug code in early stage >> > >> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> >> > --- >> > include/linux/page_cgroup.h | 6 ++++- >> > mm/memcontrol.c | 52 +++++++++++++++++++++++++++----------------- >> > mm/page_cgroup.c | 2 - >> > 3 files changed, 38 insertions(+), 22 deletions(-) >> > >> > Index: mmotm-0811/include/linux/page_cgroup.h >> > =================================================================== >> > --- mmotm-0811.orig/include/linux/page_cgroup.h >> > +++ mmotm-0811/include/linux/page_cgroup.h >> > @@ -9,10 +9,14 @@ >> > * page_cgroup helps us identify information about the cgroup >> > * All page cgroups are allocated at boot or memory hotplug event, >> > * then the page cgroup for pfn always exists. >> > + * >> > + * TODO: It seems ID for cgroup can be packed into "flags". But there will >> > + * be race between assigning ID <-> set/clear flags. Please be careful. >> > */ >> > struct page_cgroup { >> > unsigned long flags; >> > - struct mem_cgroup *mem_cgroup; >> > + unsigned short mem_cgroup; /* ID of assigned memory cgroup */ >> > + unsigned short blk_cgroup; /* Not Used..but will be. */ >> > struct page *page; >> > struct list_head lru; /* per cgroup LRU list */ >> > }; >> > Index: mmotm-0811/mm/page_cgroup.c >> > =================================================================== >> > --- mmotm-0811.orig/mm/page_cgroup.c >> > +++ mmotm-0811/mm/page_cgroup.c >> > @@ -15,7 +15,7 @@ static void __meminit >> > __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) >> > { >> > pc->flags = 0; >> > - pc->mem_cgroup = NULL; >> > + pc->mem_cgroup = 0; >> > pc->page = pfn_to_page(pfn); >> > INIT_LIST_HEAD(&pc->lru); >> > } >> > Index: mmotm-0811/mm/memcontrol.c >> > =================================================================== >> > --- mmotm-0811.orig/mm/memcontrol.c >> > +++ mmotm-0811/mm/memcontrol.c >> > @@ -300,12 +300,13 @@ static atomic_t mem_cgroup_num; >> > #define NR_MEMCG_GROUPS (CONFIG_MEM_CGROUP_MAX_GROUPS + 1) >> > static struct mem_cgroup *mem_cgroups[NR_MEMCG_GROUPS] __read_mostly; >> > >> > -/* Must be called under rcu_read_lock */ >> > -static struct mem_cgroup *id_to_memcg(unsigned short id) >> > +/* Must be called under rcu_read_lock, set safe==true if under lock */ >> > +static struct mem_cgroup *id_to_memcg(unsigned short id, bool safe) >> > { >> > struct mem_cgroup *ret; >> > /* see mem_cgroup_free() */ >> > - ret = rcu_dereference_check(mem_cgroups[id], rch_read_lock_held()); >> > + ret = rcu_dereference_check(mem_cgroups[id], >> > + rch_read_lock_held() || safe); >> > if (likely(ret && ret->valid)) >> > return ret; >> > return NULL; >> > @@ -381,7 +382,12 @@ struct cgroup_subsys_state *mem_cgroup_c >> > static struct mem_cgroup_per_zone * >> > page_cgroup_zoneinfo(struct page_cgroup *pc) >> > { >> > - struct mem_cgroup *mem = pc->mem_cgroup; >> > + /* >> > + * The caller should guarantee this "pc" is under lock. In typical >> > + * case, this function is called by lru function with zone->lru_lock. >> > + * It is a safe access. >> > + */ >> > + struct mem_cgroup *mem = id_to_memcg(pc->mem_cgroup, true); >> > int nid = page_cgroup_nid(pc); >> > int zid = page_cgroup_zid(pc); >> > >> > @@ -723,6 +729,11 @@ static inline bool mem_cgroup_is_root(st >> > return (mem == root_mem_cgroup); >> > } >> > >> > +static inline bool mem_cgroup_is_rootid(unsigned short id) >> > +{ >> > + return (id == 1); >> > +} >> > + >> > /* >> > * Following LRU functions are allowed to be used without PCG_LOCK. >> > * Operations are called by routine of global LRU independently from memcg. >> > @@ -755,7 +766,7 @@ void mem_cgroup_del_lru_list(struct page >> > */ >> > mz = page_cgroup_zoneinfo(pc); >> > MEM_CGROUP_ZSTAT(mz, lru) -= 1; >> > - if (mem_cgroup_is_root(pc->mem_cgroup)) >> > + if (mem_cgroup_is_rootid(pc->mem_cgroup)) >> > return; >> > VM_BUG_ON(list_empty(&pc->lru)); >> > list_del_init(&pc->lru); >> > @@ -782,7 +793,7 @@ void mem_cgroup_rotate_lru_list(struct p >> > */ >> > smp_rmb(); >> > /* unused or root page is not rotated. */ >> > - if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) >> > + if (!PageCgroupUsed(pc) || mem_cgroup_is_rootid(pc->mem_cgroup)) >> > return; >> > mz = page_cgroup_zoneinfo(pc); >> > list_move(&pc->lru, &mz->lists[lru]); >> > @@ -808,7 +819,7 @@ void mem_cgroup_add_lru_list(struct page >> > mz = page_cgroup_zoneinfo(pc); >> > MEM_CGROUP_ZSTAT(mz, lru) += 1; >> > SetPageCgroupAcctLRU(pc); >> > - if (mem_cgroup_is_root(pc->mem_cgroup)) >> > + if (mem_cgroup_is_rootid(pc->mem_cgroup)) >> > return; >> > list_add(&pc->lru, &mz->lists[lru]); >> > } >> > @@ -1497,7 +1508,7 @@ void mem_cgroup_update_file_mapped(struc >> > return; >> > >> > lock_page_cgroup(pc); >> > - mem = pc->mem_cgroup; >> > + mem = id_to_memcg(pc->mem_cgroup, true); >> > if (!mem || !PageCgroupUsed(pc)) >> > goto done; >> > >> > @@ -1885,14 +1896,14 @@ struct mem_cgroup *try_get_mem_cgroup_fr >> > pc = lookup_page_cgroup(page); >> > lock_page_cgroup(pc); >> > if (PageCgroupUsed(pc)) { >> > - mem = pc->mem_cgroup; >> > + mem = id_to_memcg(pc->mem_cgroup, true); >> > if (mem && !css_tryget(&mem->css)) >> > mem = NULL; >> > } else if (PageSwapCache(page)) { >> > ent.val = page_private(page); >> > id = lookup_swap_cgroup(ent); >> > rcu_read_lock(); >> > - mem = id_to_memcg(id); >> > + mem = id_to_memcg(id, false); >> > if (mem && !css_tryget(&mem->css)) >> > mem = NULL; >> > rcu_read_unlock(); >> > @@ -1921,7 +1932,7 @@ static void __mem_cgroup_commit_charge(s >> > return; >> > } >> > >> > - pc->mem_cgroup = mem; >> > + pc->mem_cgroup = css_id(&mem->css); >> > /* >> > * We access a page_cgroup asynchronously without lock_page_cgroup(). >> > * Especially when a page_cgroup is taken from a page, pc->mem_cgroup >> > @@ -1979,7 +1990,7 @@ static void __mem_cgroup_move_account(st >> > VM_BUG_ON(PageLRU(pc->page)); >> > VM_BUG_ON(!PageCgroupLocked(pc)); >> >> Should this be VM_BUG_ON(!rcu_read_lock_held())? I suspect that >> mem_cgroup_move_account() should grab rcu read lock (see my comment below). >> > > No. (see below) > > >> > VM_BUG_ON(!PageCgroupUsed(pc)); >> > - VM_BUG_ON(pc->mem_cgroup != from); >> > + VM_BUG_ON(id_to_memcg(pc->mem_cgroup, true) != from); >> > >> > if (PageCgroupFileMapped(pc)) { >> > /* Update mapped_file data for mem_cgroup */ >> > @@ -1994,7 +2005,7 @@ static void __mem_cgroup_move_account(st >> > mem_cgroup_cancel_charge(from); >> > >> > /* caller should have done css_get */ >> > - pc->mem_cgroup = to; >> > + pc->mem_cgroup = css_id(&to->css); >> > mem_cgroup_charge_statistics(to, pc, true); >> > /* >> > * We charges against "to" which may not have any tasks. Then, "to" >> > @@ -2014,11 +2025,11 @@ static int mem_cgroup_move_account(struc >> > { >> > int ret = -EINVAL; >> > lock_page_cgroup(pc); >> >> Should this be rcu_read_lock() instead of lock_page_cgroup()? >> > > No. > - lock_page_cgroup() is for keeping page_cgroup's status stable. > - rcu_read_lock() is for delaying to discard mem_cgroup object. > > rcu_read_lock() is just for delaying to discard object, not for avoiding > racy updates. All _updates_ requires proper lock or speculative logic as > atomic_inc_not_zero() etc... Basically, RCU is for avoiding use-after-free. Thanks for the info. Referring to your original patch: > @@ -2014,11 +2025,11 @@ static int mem_cgroup_move_account(struc > { > int ret = -EINVAL; > lock_page_cgroup(pc); > - if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { > + if (PageCgroupUsed(pc) && id_to_memcg(pc->mem_cgroup, true) == from) { > __mem_cgroup_move_account(pc, from, to, uncharge); > ret = 0; > } > - unlock_page_cgroup(pc); > + rcu_read_unlock(); > It seems like mem_cgroup_move_account() is not balanced. Why is lock_page_cgroup(pc) used to lock but rcu_read_unlock() used to unlock? -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>