On Mon, 12 Oct 2009 11:54:33 -0400 Prarit Bhargava <prarit@xxxxxxxxxx> wrote: > This patch was sent to me by Balbir Singh, cc'd, who worked on the original > patch. The patch results in a massive increase in performance on a 64p/32G > system. The patch was successfully compiled and tested by me on fedora-latest. > > From the upstream commit: > > "Data from Prarit (kernel compile with make -j64 on a 64 > CPU/32G machine) > > For a single run > > Without patch > > real 27m8.988s > user 87m24.916s > sys 382m6.037s > > With patch > > real 4m18.607s > user 84m58.943s > sys 50m52.682s > > With config turned off > > real 4m54.972s > user 90m13.456s > sys 50m19.711s > > NOTE: The data looks counterintuitive due to the increased performance > with the patch, even over the config being turned off. We probably need > more runs, but so far all testing has shown that the patches definitely > help." > Yes, please import this change. Thank you for backporting. More info: This patch itself applies only to the "root" cgroup, which never has "limits". Patches for "child" cgroups are still under test in -mm; that case is more complicated. Thanks, -Kame > ------------------------------------------------------------------------------- > > Backport 0c3e73e84fe3f64cf1c2e8bb4e91e8901cbcdc38 > > From: Balbir Singh <balbir@xxxxxxxxxxxxxxxxxx> > > (memcg: improve resource counter scalability) to 2.6.31. > It is a very useful patch for non-users of memory control > group as it reduces the overhead quite significantly. 
> > Signed-off-by: Balbir Singh <balbir@xxxxxxxxxxxxxxxxxx> > --- > > mm/memcontrol.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++--------- > 1 files changed, 106 insertions(+), 21 deletions(-) > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index fd4529d..4821be0 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -43,6 +43,7 @@ > > struct cgroup_subsys mem_cgroup_subsys __read_mostly; > #define MEM_CGROUP_RECLAIM_RETRIES 5 > +struct mem_cgroup *root_mem_cgroup __read_mostly; > > #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP > /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ > @@ -66,6 +67,7 @@ enum mem_cgroup_stat_index { > MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ > MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ > MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ > + MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ > > MEM_CGROUP_STAT_NSTATS, > }; > @@ -219,11 +221,24 @@ static void mem_cgroup_get(struct mem_cgroup *mem); > static void mem_cgroup_put(struct mem_cgroup *mem); > static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); > > +static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, > + bool charge) > +{ > + int val = (charge) ? 1 : -1; > + struct mem_cgroup_stat *stat = &mem->stat; > + struct mem_cgroup_stat_cpu *cpustat; > + int cpu = get_cpu(); > + > + cpustat = &stat->cpustat[cpu]; > + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); > + put_cpu(); > +} > + > static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, > struct page_cgroup *pc, > bool charge) > { > - int val = (charge)? 1 : -1; > + int val = (charge) ? 
1 : -1; > struct mem_cgroup_stat *stat = &mem->stat; > struct mem_cgroup_stat_cpu *cpustat; > int cpu = get_cpu(); > @@ -354,6 +369,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, > return ret; > } > > +static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) > +{ > + return (mem == root_mem_cgroup); > +} > + > /* > * Following LRU functions are allowed to be used without PCG_LOCK. > * Operations are called by routine of global LRU independently from memcg. > @@ -996,9 +1016,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, > VM_BUG_ON(css_is_removed(&mem->css)); > > while (1) { > - int ret; > + int ret = 0; > bool noswap = false; > > + if (mem_cgroup_is_root(mem)) > + goto done; > ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); > if (likely(!ret)) { > if (!do_swap_account) > @@ -1046,6 +1068,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, > goto nomem; > } > } > +done: > return 0; > nomem: > css_put(&mem->css); > @@ -1119,9 +1142,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, > lock_page_cgroup(pc); > if (unlikely(PageCgroupUsed(pc))) { > unlock_page_cgroup(pc); > - res_counter_uncharge(&mem->res, PAGE_SIZE); > - if (do_swap_account) > - res_counter_uncharge(&mem->memsw, PAGE_SIZE); > + if (!mem_cgroup_is_root(mem)) { > + res_counter_uncharge(&mem->res, PAGE_SIZE); > + if (do_swap_account) > + res_counter_uncharge(&mem->memsw, PAGE_SIZE); > + } > css_put(&mem->css); > return; > } > @@ -1178,7 +1203,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, > if (pc->mem_cgroup != from) > goto out; > > - res_counter_uncharge(&from->res, PAGE_SIZE); > + if (!mem_cgroup_is_root(from)) > + res_counter_uncharge(&from->res, PAGE_SIZE); > mem_cgroup_charge_statistics(from, pc, false); > > page = pc->page; > @@ -1197,7 +1223,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, > 1); > } > > - if (do_swap_account) > + if (do_swap_account && !mem_cgroup_is_root(from)) 
> res_counter_uncharge(&from->memsw, PAGE_SIZE); > css_put(&from->css); > > @@ -1268,9 +1294,11 @@ uncharge: > /* drop extra refcnt by try_charge() */ > css_put(&parent->css); > /* uncharge if move fails */ > - res_counter_uncharge(&parent->res, PAGE_SIZE); > - if (do_swap_account) > - res_counter_uncharge(&parent->memsw, PAGE_SIZE); > + if (!mem_cgroup_is_root(parent)) { > + res_counter_uncharge(&parent->res, PAGE_SIZE); > + if (do_swap_account) > + res_counter_uncharge(&parent->memsw, PAGE_SIZE); > + } > return ret; > } > > @@ -1459,7 +1487,9 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, > * This recorded memcg can be obsolete one. So, avoid > * calling css_tryget > */ > - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); > + if (!mem_cgroup_is_root(memcg)) > + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); > + mem_cgroup_swap_statistics(memcg, false); > mem_cgroup_put(memcg); > } > rcu_read_unlock(); > @@ -1484,9 +1514,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) > return; > if (!mem) > return; > - res_counter_uncharge(&mem->res, PAGE_SIZE); > - if (do_swap_account) > - res_counter_uncharge(&mem->memsw, PAGE_SIZE); > + if (!mem_cgroup_is_root(mem)) { > + res_counter_uncharge(&mem->res, PAGE_SIZE); > + if (do_swap_account) > + res_counter_uncharge(&mem->memsw, PAGE_SIZE); > + } > css_put(&mem->css); > } > > @@ -1538,9 +1570,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) > break; > } > > - res_counter_uncharge(&mem->res, PAGE_SIZE); > - if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) > - res_counter_uncharge(&mem->memsw, PAGE_SIZE); > + if (!mem_cgroup_is_root(mem)) { > + res_counter_uncharge(&mem->res, PAGE_SIZE); > + if (do_swap_account && > + (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) > + res_counter_uncharge(&mem->memsw, PAGE_SIZE); > + } > + if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) > + mem_cgroup_swap_statistics(mem, true); > 
mem_cgroup_charge_statistics(mem, pc, false); > > ClearPageCgroupUsed(pc); > @@ -1629,7 +1666,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) > * We uncharge this because swap is freed. > * This memcg can be obsolete one. We avoid calling css_tryget > */ > - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); > + if (!mem_cgroup_is_root(memcg)) > + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); > + mem_cgroup_swap_statistics(memcg, false); > mem_cgroup_put(memcg); > } > rcu_read_unlock(); > @@ -2046,20 +2085,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, > return retval; > } > > +struct mem_cgroup_idx_data { > + s64 val; > + enum mem_cgroup_stat_index idx; > +}; > + > +static int > +mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) > +{ > + struct mem_cgroup_idx_data *d = data; > + d->val += mem_cgroup_read_stat(&mem->stat, d->idx); > + return 0; > +} > + > +static void > +mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, > + enum mem_cgroup_stat_index idx, s64 *val) > +{ > + struct mem_cgroup_idx_data d; > + d.idx = idx; > + d.val = 0; > + mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); > + *val = d.val; > +} > + > static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) > { > struct mem_cgroup *mem = mem_cgroup_from_cont(cont); > - u64 val = 0; > + u64 idx_val, val; > int type, name; > > type = MEMFILE_TYPE(cft->private); > name = MEMFILE_ATTR(cft->private); > switch (type) { > case _MEM: > - val = res_counter_read_u64(&mem->res, name); > + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { > + mem_cgroup_get_recursive_idx_stat(mem, > + MEM_CGROUP_STAT_CACHE, &idx_val); > + val = idx_val; > + mem_cgroup_get_recursive_idx_stat(mem, > + MEM_CGROUP_STAT_RSS, &idx_val); > + val += idx_val; > + val <<= PAGE_SHIFT; > + } else > + val = res_counter_read_u64(&mem->res, name); > break; > case _MEMSWAP: > - val = res_counter_read_u64(&mem->memsw, name); > + if (name == RES_USAGE && 
mem_cgroup_is_root(mem)) { > + mem_cgroup_get_recursive_idx_stat(mem, > + MEM_CGROUP_STAT_CACHE, &idx_val); > + val = idx_val; > + mem_cgroup_get_recursive_idx_stat(mem, > + MEM_CGROUP_STAT_RSS, &idx_val); > + val += idx_val; > + mem_cgroup_get_recursive_idx_stat(mem, > + MEM_CGROUP_STAT_SWAPOUT, &idx_val); > + val <<= PAGE_SHIFT; > + } else > + val = res_counter_read_u64(&mem->memsw, name); > break; > default: > BUG(); > @@ -2548,6 +2631,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) > /* root ? */ > if (cont->parent == NULL) { > enable_swap_cgroup(); > + root_mem_cgroup = mem; > parent = NULL; > } else { > parent = mem_cgroup_from_cont(cont->parent); > @@ -2577,6 +2661,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) > return &mem->css; > free_out: > __mem_cgroup_free(mem); > + root_mem_cgroup = NULL; > return ERR_PTR(error); > } > > > _______________________________________________ > Fedora-kernel-list mailing list > Fedora-kernel-list@xxxxxxxxxx > https://www.redhat.com/mailman/listinfo/fedora-kernel-list > _______________________________________________ Fedora-kernel-list mailing list Fedora-kernel-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/fedora-kernel-list