Re: [PATCH 8/7] memcg : reclaim statistics

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On Mon, Apr 25, 2011 at 2:43 AM, KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
When tuning memcg background reclaim, the cpu usage of each memcg's work
is interesting information because a shared resource is consumed
(i.e. background reclaim runs on a workqueue). Other information, such
as the number of pages scanned and reclaimed, is also important.

This patch exposes these via memory.stat, together with the time spent
and the page scan/free statistics for direct reclaim and softlimit reclaim.


 # cat /cgroup/memory/A/memory.stat
 ....
 direct_elapsed_ns 0
 soft_elapsed_ns 0
 wmark_elapsed_ns 103566424
 direct_scanned 0
 soft_scanned 0
 wmark_scanned 29303
 direct_freed 0
 soft_freed 0
 wmark_freed 29290


Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
---
 Documentation/cgroups/memory.txt |   18 +++++++++
 include/linux/memcontrol.h       |    6 +++
 include/linux/swap.h             |    7 +++
 mm/memcontrol.c                  |   77 +++++++++++++++++++++++++++++++++++++--
 mm/vmscan.c                      |   15 +++++++
 5 files changed, 120 insertions(+), 3 deletions(-)

Index: memcg/mm/memcontrol.c
===================================================================
--- memcg.orig/mm/memcontrol.c
+++ memcg/mm/memcontrol.c
@@ -274,6 +274,17 @@ struct mem_cgroup {
       bool                    bgreclaim_resched;
       struct delayed_work     bgreclaim_work;
       /*
+        * reclaim statistics (not per zone, node)
+        */
+       spinlock_t              elapsed_lock;
+       u64                     bgreclaim_elapsed;
+       u64                     direct_elapsed;
+       u64                     soft_elapsed;
+
+       u64                     reclaim_scan[NR_RECLAIM_CONTEXTS];
+       u64                     reclaim_freed[NR_RECLAIM_CONTEXTS];
+
+       /*
        * Should we move charges of a task when a task is moved into this
        * mem_cgroup ? And what type of charges should we move ?
        */
@@ -1346,6 +1357,18 @@ void mem_cgroup_clear_unreclaimable(stru
       return;
 }

+void mem_cgroup_reclaim_statistics(struct mem_cgroup *mem,
+               int context, unsigned long scanned,
+               unsigned long freed)
+{      /* Add scanned/freed to mem's per-context reclaim totals. */
+       if (!mem)       /* no memcg given: nothing to account */
+               return;
+       spin_lock(&mem->elapsed_lock);  /* also guards the *_elapsed fields */
+       mem->reclaim_scan[context] += scanned;  /* context is the array index; NOTE(review): not range-checked */
+       mem->reclaim_freed[context] += freed;
+       spin_unlock(&mem->elapsed_lock);
+}
+
 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
                                       struct list_head *dst,
                                       unsigned long *scanned, int order,
@@ -1692,6 +1715,7 @@ static int mem_cgroup_hierarchical_recla
       bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
       unsigned long excess;
       unsigned long nr_scanned;
+       s64 start, end;

       excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;

@@ -1735,16 +1759,27 @@ static int mem_cgroup_hierarchical_recla
               }
               /* we use swappiness of local cgroup */
               if (check_soft) {
+                       start = sched_clock();
                       ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
                               noswap, mem_cgroup_swappiness(victim), zone,
                               &nr_scanned);
                       *total_scanned += nr_scanned;
+                       end = sched_clock();
+                       spin_lock(&victim->elapsed_lock);
+                       victim->soft_elapsed += end - start;
+                       spin_unlock(&victim->elapsed_lock);
                       mem_cgroup_soft_steal(victim, ret);
                       mem_cgroup_soft_scan(victim, nr_scanned);
-               } else
+               } else {
+                       start = sched_clock();
                       ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
                                               noswap,
                                               mem_cgroup_swappiness(victim));
+                       end = sched_clock();
+                       spin_lock(&victim->elapsed_lock);
+                       victim->direct_elapsed += end - start;
+                       spin_unlock(&victim->elapsed_lock);
+               }
               css_put(&victim->css);
               /*
                * At shrinking usage, we can't check we should stop here or
@@ -3702,15 +3737,22 @@ static void memcg_bgreclaim(struct work_
       struct delayed_work *dw = to_delayed_work(work);
       struct mem_cgroup *mem =
               container_of(dw, struct mem_cgroup, bgreclaim_work);
-       int delay = 0;
+       int delay;
       unsigned long long required, usage, hiwat;

+       delay = 0;
       hiwat = res_counter_read_u64(&mem->res, RES_HIGH_WMARK_LIMIT);
       usage = res_counter_read_u64(&mem->res, RES_USAGE);
       required = usage - hiwat;
       if (required >= 0)  {
+               u64 start, end;
               required = ((usage - hiwat) >> PAGE_SHIFT) + 1;
+               start = sched_clock();
               delay = shrink_mem_cgroup(mem, (long)required);
+               end = sched_clock();
+               spin_lock(&mem->elapsed_lock);
+               mem->bgreclaim_elapsed += end - start;
+               spin_unlock(&mem->elapsed_lock);
       }
       if (!mem->bgreclaim_resched  ||
               mem_cgroup_watermark_ok(mem, CHARGE_WMARK_HIGH)) {
@@ -4152,6 +4194,15 @@ enum {
       MCS_INACTIVE_FILE,
       MCS_ACTIVE_FILE,
       MCS_UNEVICTABLE,
+       MCS_DIRECT_ELAPSED,
+       MCS_SOFT_ELAPSED,
+       MCS_WMARK_ELAPSED,
+       MCS_DIRECT_SCANNED,
+       MCS_SOFT_SCANNED,
+       MCS_WMARK_SCANNED,
+       MCS_DIRECT_FREED,
+       MCS_SOFT_FREED,
+       MCS_WMARK_FREED,
       NR_MCS_STAT,
 };

@@ -4177,7 +4228,16 @@ struct {
       {"active_anon", "total_active_anon"},
       {"inactive_file", "total_inactive_file"},
       {"active_file", "total_active_file"},
-       {"unevictable", "total_unevictable"}
+       {"unevictable", "total_unevictable"},
+       {"direct_elapsed_ns", "total_direct_elapsed_ns"},
+       {"soft_elapsed_ns", "total_soft_elapsed_ns"},
+       {"wmark_elapsed_ns", "total_wmark_elapsed_ns"},
+       {"direct_scanned", "total_direct_scanned"},
+       {"soft_scanned", "total_soft_scanned"},
+       {"wmark_scanned", "total_wmark_scanned"},
+       {"direct_freed", "total_direct_freed"},
+       {"soft_freed", "total_soft_freed"},
+       {"wmark_freed", "total_wmark_freed"}
 };


@@ -4185,6 +4245,7 @@ static void
 mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
 {
       s64 val;
+       int i;

       /* per cpu stat */
       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
@@ -4221,6 +4282,15 @@ mem_cgroup_get_local_stat(struct mem_cgr
       s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
       val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
       s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
+
+       /* reclaim stats */
+       s->stat[MCS_DIRECT_ELAPSED] += mem->direct_elapsed;
+       s->stat[MCS_SOFT_ELAPSED] += mem->soft_elapsed;
+       s->stat[MCS_WMARK_ELAPSED] += mem->bgreclaim_elapsed;
+       for (i = 0; i < NR_RECLAIM_CONTEXTS; i++) {
+               s->stat[i + MCS_DIRECT_SCANNED] += mem->reclaim_scan[i];
+               s->stat[i + MCS_DIRECT_FREED] += mem->reclaim_freed[i];
+       }
 }

 static void
@@ -4889,6 +4959,7 @@ static struct mem_cgroup *mem_cgroup_all
               goto out_free;
       spin_lock_init(&mem->pcp_counter_lock);
       INIT_DELAYED_WORK(&mem->bgreclaim_work, memcg_bgreclaim);
+       spin_lock_init(&mem->elapsed_lock);
       mem->bgreclaim_resched = true;
       return mem;

Index: memcg/include/linux/memcontrol.h
===================================================================
--- memcg.orig/include/linux/memcontrol.h
+++ memcg/include/linux/memcontrol.h
@@ -90,6 +90,8 @@ extern int mem_cgroup_select_victim_node
                                       const nodemask_t *nodes);

 int shrink_mem_cgroup(struct mem_cgroup *mem, long required);
+void mem_cgroup_reclaim_statistics(struct mem_cgroup *mem, int context,
+                       unsigned long scanned, unsigned long freed);

 static inline
 int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
@@ -423,6 +425,10 @@ static inline
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
+static inline void mem_cgroup_reclaim_statistics(struct mem_cgroup *mem,
+               int context, unsigned long scanned, unsigned long freed)
+{      /* no-op stub; static inline so each includer gets a private copy */
+}
 #endif /* CONFIG_CGROUP_MEM_CONT */

 #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
Index: memcg/include/linux/swap.h
===================================================================
--- memcg.orig/include/linux/swap.h
+++ memcg/include/linux/swap.h
@@ -250,6 +250,13 @@ static inline void lru_cache_add_file(st
 #define ISOLATE_ACTIVE 1       /* Isolate active pages. */
 #define ISOLATE_BOTH 2         /* Isolate both active and inactive pages. */

+/* Reclaim contexts; they index memcg's per-context reclaim statistics. */
+enum {
+       RECLAIM_DIRECT,         /* under direct reclaim */
+       RECLAIM_KSWAPD,         /* under global kswapd's soft limit */
+       RECLAIM_WMARK,          /* under background reclaim by watermark */
+       NR_RECLAIM_CONTEXTS     /* number of contexts; sizes the stat arrays */
+};
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                       gfp_t gfp_mask, nodemask_t *mask);
Index: memcg/mm/vmscan.c
===================================================================
--- memcg.orig/mm/vmscan.c
+++ memcg/mm/vmscan.c
@@ -72,6 +72,9 @@ typedef unsigned __bitwise__ reclaim_mod
 #define RECLAIM_MODE_LUMPYRECLAIM      ((__force reclaim_mode_t)0x08u)
 #define RECLAIM_MODE_COMPACTION                ((__force reclaim_mode_t)0x10u)

+/* 3 reclaim contexts for memcg statistics. */
+enum {DIRECT_RECLAIM, KSWAPD_RECLAIM, WMARK_RECLAIM};
+
 struct scan_control {
       /* Incremented by the number of inactive pages that were scanned */
       unsigned long nr_scanned;
@@ -107,6 +110,7 @@ struct scan_control {

       /* Which cgroup do we reclaim from */
       struct mem_cgroup *mem_cgroup;
+       int     reclaim_context;

       /*
        * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -2116,6 +2120,10 @@ out:
       delayacct_freepages_end();
       put_mems_allowed();

+       if (!scanning_global_lru(sc))
+               mem_cgroup_reclaim_statistics(sc->mem_cgroup,
+                       sc->reclaim_context, total_scanned, sc->nr_reclaimed);
+
       if (sc->nr_reclaimed)
               return sc->nr_reclaimed;

@@ -2178,6 +2186,7 @@ unsigned long mem_cgroup_shrink_node_zon
               .swappiness = swappiness,
               .order = 0,
               .mem_cgroup = mem,
+               .reclaim_context = RECLAIM_KSWAPD,
       };

       sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2198,6 +2207,8 @@ unsigned long mem_cgroup_shrink_node_zon

       trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

+       mem_cgroup_reclaim_statistics(sc.mem_cgroup,
+                       sc.reclaim_context, sc.nr_scanned, sc.nr_reclaimed);
       *nr_scanned = sc.nr_scanned;
       return sc.nr_reclaimed;
 }
@@ -2217,6 +2228,7 @@ unsigned long try_to_free_mem_cgroup_pag
               .swappiness = swappiness,
               .order = 0,
               .mem_cgroup = mem_cont,
+               .reclaim_context = RECLAIM_DIRECT,
               .nodemask = NULL, /* we don't care the placement */
       };

@@ -2384,6 +2396,7 @@ int shrink_mem_cgroup(struct mem_cgroup
               .may_swap = 1,
               .order = 0,
               .mem_cgroup = mem,
+               .reclaim_context = RECLAIM_WMARK,
       };
       /* writepage will be set later per zone */
       sc.may_writepage = 0;
@@ -2434,6 +2447,8 @@ int shrink_mem_cgroup(struct mem_cgroup
       if (sc.nr_reclaimed > sc.nr_to_reclaim/2)
               delay = 0;
 out:
+       mem_cgroup_reclaim_statistics(sc.mem_cgroup, sc.reclaim_context,
+                       total_scanned, sc.nr_reclaimed);
       current->flags &= ~PF_SWAPWRITE;
       return delay;
 }
Index: memcg/Documentation/cgroups/memory.txt
===================================================================
--- memcg.orig/Documentation/cgroups/memory.txt
+++ memcg/Documentation/cgroups/memory.txt
@@ -398,6 +398,15 @@ active_anon        - # of bytes of anonymous an
 inactive_file  - # of bytes of file-backed memory on inactive LRU list.
 active_file    - # of bytes of file-backed memory on active LRU list.
 unevictable    - # of bytes of memory that cannot be reclaimed (mlocked etc).
+direct_elapsed_ns  - elapsed time spent in hard limit reclaim (ns)
+soft_elapsed_ns  - elapsed time spent in soft limit reclaim (ns)
+wmark_elapsed_ns  - elapsed time spent in hi/low watermark reclaim (ns)
+direct_scanned - # of pages scanned by hard limit reclaim
+soft_scanned   - # of pages scanned by soft limit reclaim
+wmark_scanned  - # of pages scanned by hi/low watermark reclaim
+direct_freed   - # of pages freed by hard limit reclaim
+soft_freed     - # of pages freed by soft limit reclaim
+wmark_freed    - # of pages freed by hi/low watermark reclaim

 # status considering hierarchy (see memory.use_hierarchy settings)

@@ -421,6 +430,15 @@ total_active_anon  - sum of all children'
 total_inactive_file    - sum of all children's "inactive_file"
 total_active_file      - sum of all children's "active_file"
 total_unevictable      - sum of all children's "unevictable"
+total_direct_elapsed_ns - sum of all children's "direct_elapsed_ns"
+total_soft_elapsed_ns  - sum of all children's "soft_elapsed_ns"
+total_wmark_elapsed_ns - sum of all children's "wmark_elapsed_ns"
+total_direct_scanned   - sum of all children's "direct_scanned"
+total_soft_scanned     - sum of all children's "soft_scanned"
+total_wmark_scanned    - sum of all children's "wmark_scanned"
+total_direct_freed     - sum of all children's "direct_freed"
+total_soft_freed       - sum of all children's "soft_freed"
+total_wmark_freed      - sum of all children's "wmark_freed"

 # The following additional stats are dependent on CONFIG_DEBUG_VM.

Those stats look good to me. Thanks.

--Ying 


[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]