Re: [RFC] [PATCH 4/7 v2] memcg: new scheme to update per-memcg page stat accounting.

On Fri, Jan 13, 2012 at 12:41 AM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
> From 08a81022fa6f820a42aa5bf3a24ee07690dfff99 Mon Sep 17 00:00:00 2001
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> Date: Thu, 12 Jan 2012 18:13:32 +0900
> Subject: [PATCH 4/7] memcg: new scheme to update per-memcg page stat accounting.
>
> Now, page status accounting is done by a call mem_cgroup_update_page_stat()
nitpick: s/by a call/by calling/

> and this function sets a flag in page_cgroup->flags.
>
> This flag was required because the page's status and the page <=> memcg
> relationship cannot be updated in an atomic way.

I assume we are talking about the PCG_FILE_MAPPED flag; can we name it
explicitly here? For example, in the race described below:
> Considering FILE_MAPPED,
>
>        CPU A                      CPU B
>                                pick up a page to be moved.
>    set page_mapcount()=0.
>                                move memcg's FILE_MAPPED stat --(*)
>                                overwrite pc->mem_cgroup
>    modify memcg's FILE_MAPPED-(**)
>
> If we don't have a flag in pc->flags, we won't move the 'FILE_MAPPED'
> accounting information at (*) and we'll decrease FILE_MAPPED in (**)
> from the wrong cgroup. We'll see the same kind of race when handling the
> dirty, writeback, etc. bits. (And the Dirty flag has another problem
> which cannot be handled by a flag on page_cgroup.)
>
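
To make this concrete with the actual names (my reading of the current
code, so please correct me if the move side differs):

       CPU A (page_remove_rmap)        CPU B (mem_cgroup_move_account)
   atomic_add_negative(-1,
       &page->_mapcount) -> 0
                                       page_mapped(page) is now false, so
                                       FILE_MAPPED is not transferred --(*)
                                       pc->mem_cgroup = to;
   mem_cgroup_dec_page_stat(page,
       MEMCG_NR_FILE_MAPPED)                                         --(**)

Without PCG_FILE_MAPPED, (*) skips the transfer and (**) decrements the
new memcg, which never had this page counted, so both memcgs end up with
a skewed FILE_MAPPED count.
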
> I'd like to remove this flag because
>  - In recent discussions, removing pc->flags is our direction.
>  - This kind of duplication of flags/status is fragile, and
>    it's better to use the status in 'struct page' itself.
>
> This patch is for removing page_cgroup's special flag for
> page-state accounting and for using 'struct page's status itself.

I think this patch itself doesn't remove any pc->flags bits; that happens
in the following patch, which removes the PCG_FILE_MAPPED flag.

>
> This patch adds support for atomic updates of page statistics accounting
> in memcg. In short, it prevents a page from being moved from one memcg
> to another while its status is being updated, by...
>
>        locked = mem_cgroup_begin_update_page_stat(page)
>        modify page
>        mem_cgroup_update_page_stat(page)
>        mem_cgroup_end_update_page_stat(page, locked)
>
> Between begin_update_page_stat() and end_update_page_stat(),
> the page_cgroup will never be moved to another memcg.

This is nice.
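
For anyone skimming, the updater side then looks like this in
page_add_file_rmap() after this patch (copied from the diff below, with
my understanding of the locking added as comments):

    bool locked = mem_cgroup_begin_update_page_stat(page);
                            /* rcu_read_lock(); takes the account-move
                             * lock only if a move is in flight */
    if (atomic_inc_and_test(&page->_mapcount)) {
            __inc_zone_page_state(page, NR_FILE_MAPPED);
            mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
    }
    mem_cgroup_end_update_page_stat(page, locked);
                            /* unlocks if needed, then rcu_read_unlock() */

so pc->mem_cgroup stays stable between the _mapcount change and the stat
update.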

In general, the description needs some work; it isn't clear to me at
first glance what this patch does.
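
One more thought: mem_cgroup_update_page_stat() now silently relies on
the caller having called mem_cgroup_begin_update_page_stat() (and thus
holding rcu_read_lock() and, if needed, the move lock). Would it be worth
asserting that, so broken call sites show up under CONFIG_PROVE_RCU
instead of as miscounted stats? Untested sketch:

    void mem_cgroup_update_page_stat(struct page *page,
                                     enum mem_cgroup_page_stat_item idx,
                                     int val)
    {
            struct page_cgroup *pc = lookup_page_cgroup(page);
            struct mem_cgroup *memcg = pc->mem_cgroup;

            /* catch callers that skipped begin_update_page_stat() */
            WARN_ON_ONCE(!rcu_read_lock_held());
            ...
    }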

--Ying

>
> In the usual case, the overhead is rcu_read_lock() and rcu_read_unlock()
> plus lookup_page_cgroup().
>
> Note:
>  - I am still considering how to reduce the overhead of this scheme.
>    Good ideas are welcome.
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> ---
>  include/linux/memcontrol.h |   36 ++++++++++++++++++++++++++++++++++
>  mm/memcontrol.c            |   46 ++++++++++++++++++++++++++-----------------
>  mm/rmap.c                  |   14 +++++++++++-
>  3 files changed, 76 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 4d34356..976b58c 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -141,9 +141,35 @@ static inline bool mem_cgroup_disabled(void)
>        return false;
>  }
>
> +/*
> + * When we update page->flags, we'll update some memcg's counter.
> + * Unlike vmstat, memcg has per-memcg stats and page-memcg relationship
> + * can be changed while 'struct page' information is updated.
> + * We need to prevent the race by
> + *     locked = mem_cgroup_begin_update_page_stat(page)
> + *     modify 'page'
> + *     mem_cgroup_update_page_stat(page, idx, val)
> + *     mem_cgroup_end_update_page_stat(page, locked);
> + */
> +bool __mem_cgroup_begin_update_page_stat(struct page *page);
> +static inline bool mem_cgroup_begin_update_page_stat(struct page *page)
> +{
> +       if (mem_cgroup_disabled())
> +               return false;
> +       return __mem_cgroup_begin_update_page_stat(page);
> +}
>  void mem_cgroup_update_page_stat(struct page *page,
>                                 enum mem_cgroup_page_stat_item idx,
>                                 int val);
> +void __mem_cgroup_end_update_page_stat(struct page *page, bool locked);
> +static inline void
> +mem_cgroup_end_update_page_stat(struct page *page, bool locked)
> +{
> +       if (mem_cgroup_disabled())
> +               return;
> +       __mem_cgroup_end_update_page_stat(page, locked);
> +}
> +
>
>  static inline void mem_cgroup_inc_page_stat(struct page *page,
>                                            enum mem_cgroup_page_stat_item idx)
> @@ -356,6 +382,16 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
>  {
>  }
>
> +static inline bool mem_cgroup_begin_update_page_stat(struct page *page)
> +{
> +       return false;
> +}
> +
> +static inline void
> +mem_cgroup_end_update_page_stat(struct page *page, bool locked)
> +{
> +}
> +
>  static inline void mem_cgroup_inc_page_stat(struct page *page,
>                                            enum mem_cgroup_page_stat_item idx)
>  {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 61e276f..30ef810 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1912,29 +1912,43 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
>  * possibility of race condition. If there is, we take a lock.
>  */
>
> +bool __mem_cgroup_begin_update_page_stat(struct page *page)
> +{
> +       struct page_cgroup *pc = lookup_page_cgroup(page);
> +       bool locked = false;
> +       struct mem_cgroup *memcg;
> +
> +       rcu_read_lock();
> +       memcg = pc->mem_cgroup;
> +
> +       if (!memcg || !PageCgroupUsed(pc))
> +               goto out;
> +       if (mem_cgroup_stealed(memcg)) {
> +               mem_cgroup_account_move_rlock(page);
> +               locked = true;
> +       }
> +out:
> +       return locked;
> +}
> +
> +void __mem_cgroup_end_update_page_stat(struct page *page, bool locked)
> +{
> +       if (locked)
> +               mem_cgroup_account_move_runlock(page);
> +       rcu_read_unlock();
> +}
> +
>  void mem_cgroup_update_page_stat(struct page *page,
>                                 enum mem_cgroup_page_stat_item idx, int val)
>  {
> -       struct mem_cgroup *memcg;
>        struct page_cgroup *pc = lookup_page_cgroup(page);
> -       bool need_unlock = false;
> +       struct mem_cgroup *memcg = pc->mem_cgroup;
>
>        if (mem_cgroup_disabled())
>                return;
>
> -       rcu_read_lock();
> -       memcg = pc->mem_cgroup;
>        if (unlikely(!memcg || !PageCgroupUsed(pc)))
> -               goto out;
> -       /* pc->mem_cgroup is unstable ? */
> -       if (unlikely(mem_cgroup_stealed(memcg))) {
> -               /* take a lock against to access pc->mem_cgroup */
> -               mem_cgroup_account_move_rlock(page);
> -               need_unlock = true;
> -               memcg = pc->mem_cgroup;
> -               if (!memcg || !PageCgroupUsed(pc))
> -                       goto out;
> -       }
> +               return;
>
>        switch (idx) {
>        case MEMCG_NR_FILE_MAPPED:
> @@ -1950,10 +1964,6 @@ void mem_cgroup_update_page_stat(struct page *page,
>
>        this_cpu_add(memcg->stat->count[idx], val);
>
> -out:
> -       if (unlikely(need_unlock))
> -               mem_cgroup_account_move_runlock(page);
> -       rcu_read_unlock();
>        return;
>  }
>  EXPORT_SYMBOL(mem_cgroup_update_page_stat);
> diff --git a/mm/rmap.c b/mm/rmap.c
> index aa547d4..def60d1 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1150,10 +1150,13 @@ void page_add_new_anon_rmap(struct page *page,
>  */
>  void page_add_file_rmap(struct page *page)
>  {
> +       bool locked = mem_cgroup_begin_update_page_stat(page);
> +
>        if (atomic_inc_and_test(&page->_mapcount)) {
>                __inc_zone_page_state(page, NR_FILE_MAPPED);
>                mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
>        }
> +       mem_cgroup_end_update_page_stat(page, locked);
>  }
>
>  /**
> @@ -1164,10 +1167,14 @@ void page_add_file_rmap(struct page *page)
>  */
>  void page_remove_rmap(struct page *page)
>  {
> +       bool locked = false;
> +
> +       if (!PageAnon(page))
> +               locked = mem_cgroup_begin_update_page_stat(page);
> +
>        /* page still mapped by someone else? */
>        if (!atomic_add_negative(-1, &page->_mapcount))
> -               return;
> -
> +               goto out;
>        /*
>         * Now that the last pte has gone, s390 must transfer dirty
>         * flag from storage key to struct page.  We can usually skip
> @@ -1204,6 +1211,9 @@ void page_remove_rmap(struct page *page)
>         * Leaving it set also helps swapoff to reinstate ptes
>         * faster for those pages still in swapcache.
>         */
> +out:
> +       if (!PageAnon(page))
> +               mem_cgroup_end_update_page_stat(page, locked);
>  }
>
>  /*
> --
> 1.7.4.1
>
>
