Hi Johannes, On Wed, Sep 24, 2014 at 11:43:08AM -0400, Johannes Weiner wrote: > Memory is internally accounted in bytes, using spinlock-protected > 64-bit counters, even though the smallest accounting delta is a page. > The counter interface is also convoluted and does too many things. > > Introduce a new lockless word-sized page counter API, then change all > memory accounting over to it and remove the old one. The translation > from and to bytes then only happens when interfacing with userspace. > > Aside from the locking costs, this gets rid of the icky unsigned long > long types in the very heart of memcg, which is great for 32 bit and > also makes the code a lot more readable. > > Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx> It looks much better to me. A few comments below, mostly nit picking. [...] > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index c2c75262a209..52c24119be69 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c [...] > @@ -1490,12 +1495,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) > */ > static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) > { > - unsigned long long margin; > + unsigned long margin = 0; > + unsigned long count; > + unsigned long limit; > > - margin = res_counter_margin(&memcg->res); > - if (do_swap_account) > - margin = min(margin, res_counter_margin(&memcg->memsw)); > - return margin >> PAGE_SHIFT; > + count = page_counter_read(&memcg->memory); > + limit = ACCESS_ONCE(memcg->memory.limit); > + if (count < limit) Nit: IMO this looks unwieldier and less readable than res_counter_margin. And two lines below we repeat this. > + margin = limit - count; > + > + if (do_swap_account) { > + count = page_counter_read(&memcg->memsw); > + limit = ACCESS_ONCE(memcg->memsw.limit); > + if (count < limit) > + margin = min(margin, limit - count); > + } > + > + return margin; > } > > int mem_cgroup_swappiness(struct mem_cgroup *memcg) [...] > @@ -1685,30 +1698,19 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) > } > > /* > - * Return the memory (and swap, if configured) limit for a memcg. > + * Return the memory (and swap, if configured) maximum consumption for a memcg. Nit: Why did you change the comment? Now it doesn't seem to be relevant. > */ > -static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) > +static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) > { > - u64 limit; > + unsigned long limit; > > - limit = res_counter_read_u64(&memcg->res, RES_LIMIT); > - > - /* > - * Do not consider swap space if we cannot swap due to swappiness > - */ > + limit = memcg->memory.limit; > if (mem_cgroup_swappiness(memcg)) { > - u64 memsw; > + unsigned long memsw_limit; > > - limit += total_swap_pages << PAGE_SHIFT; > - memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); > - > - /* > - * If memsw is finite and limits the amount of swap space > - * available to this memcg, return that limit. > - */ > - limit = min(limit, memsw); > + memsw_limit = memcg->memsw.limit; > + limit = min(limit + total_swap_pages, memsw_limit); > } > - > return limit; > } > [...] > @@ -4114,25 +4109,27 @@ out: > } > > static int memcg_activate_kmem(struct mem_cgroup *memcg, > - unsigned long long limit) > + unsigned long nr_pages) > { > int ret; > > mutex_lock(&activate_kmem_mutex); > - ret = __memcg_activate_kmem(memcg, limit); > + ret = __memcg_activate_kmem(memcg, nr_pages); > mutex_unlock(&activate_kmem_mutex); > return ret; > } > > static int memcg_update_kmem_limit(struct mem_cgroup *memcg, > - unsigned long long val) > + unsigned long limit) > { > int ret; > > + mutex_lock(&memcg_limit_mutex); > if (!memcg_kmem_is_active(memcg)) > - ret = memcg_activate_kmem(memcg, val); > + ret = memcg_activate_kmem(memcg, limit); I think we can now get rid of the activate_kmem_mutex, but this should be done separately of course. > else > - ret = res_counter_set_limit(&memcg->kmem, val); > + ret = page_counter_limit(&memcg->kmem, limit); > + mutex_unlock(&memcg_limit_mutex); > return ret; > } > [...] > @@ -5570,10 +5530,10 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) > { > struct mem_cgroup *memcg = mem_cgroup_from_css(css); > > - mem_cgroup_resize_limit(memcg, ULLONG_MAX); > - mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); > - memcg_update_kmem_limit(memcg, ULLONG_MAX); > - res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); > + mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); > + mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); > + memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); I think we should do it only if memcg_kmem_is_active, but that's a different story. > + memcg->soft_limit = 0; > } > > #ifdef CONFIG_MMU [...] > diff --git a/mm/page_counter.c b/mm/page_counter.c > new file mode 100644 > index 000000000000..51c45921b8d1 > --- /dev/null > +++ b/mm/page_counter.c > @@ -0,0 +1,191 @@ > +/* > + * Lockless hierarchical page accounting & limiting > + * > + * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner > + */ > +#include <linux/page_counter.h> > +#include <linux/atomic.h> > + > +/** > + * page_counter_cancel - take pages out of the local counter > + * @counter: counter > + * @nr_pages: number of pages to cancel > + * > + * Returns whether there are remaining pages in the counter. > + */ > +int page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) > +{ > + long new; > + > + new = atomic_long_sub_return(nr_pages, &counter->count); > + > + if (WARN_ON_ONCE(new < 0)) > + atomic_long_add(nr_pages, &counter->count); > + > + return new > 0; > +} > + > +/** > + * page_counter_charge - hierarchically charge pages > + * @counter: counter > + * @nr_pages: number of pages to charge > + * > + * NOTE: This may exceed the configured counter limits. > + */ > +void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) > +{ > + struct page_counter *c; > + > + for (c = counter; c; c = c->parent) { > + long new; > + > + new = atomic_long_add_return(nr_pages, &c->count); > + /* > + * This is racy, but with the per-cpu caches on top > + * it's just a ballpark metric anyway; and with lazy > + * cache reclaim, the majority of workloads peg the > + * watermark to the group limit soon after launch. > + */ > + if (new > c->watermark) > + c->watermark = new; > + } > +} > + > +/** > + * page_counter_try_charge - try to hierarchically charge pages > + * @counter: counter > + * @nr_pages: number of pages to charge > + * @fail: points first counter to hit its limit, if any > + * > + * Returns 0 on success, or -ENOMEM and @fail if the counter or one of > + * its ancestors has hit its limit. > + */ > +int page_counter_try_charge(struct page_counter *counter, > + unsigned long nr_pages, > + struct page_counter **fail) > +{ > + struct page_counter *c; > + > + for (c = counter; c; c = c->parent) { > + long new; > + /* > + * Charge speculatively to avoid an expensive CAS. If > + * a bigger charge fails, it might falsely lock out a > + * racing smaller charge and send it into reclaim > + * eraly, but the error is limited to the difference Nit: s/eraly/early > + * between the two sizes, which is less than 2M/4M in > + * case of a THP locking out a regular page charge. > + */ > + new = atomic_long_add_return(nr_pages, &c->count); > + if (new > c->limit) { > + atomic_long_sub(nr_pages, &c->count); > + /* > + * This is racy, but the failcnt is only a > + * ballpark metric anyway. > + */ I still don't think that the failcnt is completely useless. As I mentioned previously, it can be used to check whether the workload is behaving badly due to memcg limits or for some other reason. And I don't see why it couldn't be atomic. This isn't a show stopper though. > + c->failcnt++; > + *fail = c; > + goto failed; > + } > + /* > + * This is racy, but with the per-cpu caches on top > + * it's just a ballpark metric anyway; and with lazy > + * cache reclaim, the majority of workloads peg the > + * watermark to the group limit soon after launch. Not for kmem, I think. > + */ > + if (new > c->watermark) > + c->watermark = new; > + } > + return 0; > + > +failed: > + for (c = counter; c != *fail; c = c->parent) > + page_counter_cancel(c, nr_pages); > + > + return -ENOMEM; > +} > + > +/** > + * page_counter_uncharge - hierarchically uncharge pages > + * @counter: counter > + * @nr_pages: number of pages to uncharge > + * > + * Returns whether there are remaining charges in @counter. > + */ > +int page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) > +{ > + struct page_counter *c; > + int ret = 1; > + > + for (c = counter; c; c = c->parent) { > + int remainder; > + > + remainder = page_counter_cancel(c, nr_pages); > + if (c == counter && !remainder) > + ret = 0; > + } > + > + return ret; > +} > + > +/** > + * page_counter_limit - limit the number of pages allowed > + * @counter: counter > + * @limit: limit to set > + * > + * Returns 0 on success, -EBUSY if the current number of pages on the > + * counter already exceeds the specified limit. > + * > + * The caller must serialize invocations on the same counter. > + */ > +int page_counter_limit(struct page_counter *counter, unsigned long limit) > +{ > + for (;;) { > + unsigned long old; > + long count; > + > + count = atomic_long_read(&counter->count); > + > + old = xchg(&counter->limit, limit); Why do you use xchg here? > + > + if (atomic_long_read(&counter->count) != count) { > + counter->limit = old; > + continue; > + } > + > + if (count > limit) { > + counter->limit = old; > + return -EBUSY; > + } I have a suspicion that this can race with page_counter_try_charge. Look, access to c->limit is not marked as volatile in try_charge so the compiler is allowed to issue read only once, in the very beginning of the try_charge function. Then try_charge may succeed after the limit was actually updated to a smaller value. Strictly speaking, using ACCESS_ONCE in try_charge wouldn't be enough AFAIU. There must be memory barriers here and there. > + > + return 0; > + } > +} > + > +/** > + * page_counter_memparse - memparse() for page counter limits > + * @buf: string to parse > + * @nr_pages: returns the result in number of pages > + * > + * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be > + * limited to %PAGE_COUNTER_MAX. > + */ > +int page_counter_memparse(const char *buf, unsigned long *nr_pages) > +{ > + char unlimited[] = "-1"; > + char *end; > + u64 bytes; > + > + if (!strncmp(buf, unlimited, sizeof(unlimited))) { > + *nr_pages = PAGE_COUNTER_MAX; > + return 0; > + } > + > + bytes = memparse(buf, &end); > + if (*end != '\0') > + return -EINVAL; > + > + *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); > + > + return 0; > +} > diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c > index 1d191357bf88..272327134a1b 100644 > --- a/net/ipv4/tcp_memcontrol.c > +++ b/net/ipv4/tcp_memcontrol.c [...] > @@ -126,43 +134,36 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, > return ret ?: nbytes; > } > > -static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) > -{ > - struct cg_proto *cg_proto; > - > - cg_proto = tcp_prot.proto_cgroup(memcg); > - if (!cg_proto) > - return default_val; > - > - return res_counter_read_u64(&cg_proto->memory_allocated, type); > -} > - > -static u64 tcp_read_usage(struct mem_cgroup *memcg) > -{ > - struct cg_proto *cg_proto; > - > - cg_proto = tcp_prot.proto_cgroup(memcg); > - if (!cg_proto) > - return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT; > - > - return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE); > -} > - > static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) > { > struct mem_cgroup *memcg = mem_cgroup_from_css(css); > + struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg); > u64 val; > > switch (cft->private) { > case RES_LIMIT: > - val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX); > + if (!cg_proto) > + return PAGE_COUNTER_MAX; For compatibility it must be ULLONG_MAX. > + val = cg_proto->memory_allocated.limit; > + val *= PAGE_SIZE; > break; > case RES_USAGE: > - val = tcp_read_usage(memcg); > + if (!cg_proto) > + val = atomic_long_read(&tcp_memory_allocated); > + else > + val = page_counter_read(&cg_proto->memory_allocated); > + val *= PAGE_SIZE; > break; > case RES_FAILCNT: > + if (!cg_proto) > + return 0; > + val = cg_proto->memory_allocated.failcnt; > + break; > case RES_MAX_USAGE: > - val = tcp_read_stat(memcg, cft->private, 0); > + if (!cg_proto) > + return 0; > + val = cg_proto->memory_allocated.watermark; > + val *= PAGE_SIZE; > break; > default: > BUG(); [...] Thanks, Vladimir -- To unsubscribe from this list: send the line "unsubscribe cgroups" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html