[PATCH] memcg: add pgfault latency histograms

This adds a histogram that captures page fault latencies on a per-memcg basis. I used
this patch while testing memcg background reclaim, and figured there could be more
use cases for monitoring/debugging application performance.

The histogram is composed of 8 buckets in nanosecond units. The last bucket is
unbounded ("inf") and counts everything beyond the previous boundary. For
flexibility, the histogram can be reset and each bucket boundary is configurable
at runtime.
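For illustration only (not part of the patch), the sketch below mirrors how a
latency value is mapped to a bucket: the default boundaries installed by
mem_cgroup_create() double from 600 ns, and memcg_histogram_record() does a
binary search for the first boundary greater than the measured delta. The helper
names here are made up for the example.

/* userspace sketch of the bucket selection; illustrative names only */
#include <stdio.h>

#define NUM_BUCKETS 8

static unsigned long long range[NUM_BUCKETS];

static void init_default_ranges(void)
{
	int i;

	/* defaults from mem_cgroup_create(): 600 ns, doubling per bucket */
	for (i = 0; i < NUM_BUCKETS - 1; i++)
		range[i] = (1ULL << i) * 600;
	range[i] = ~0ULL;			/* last bucket is "inf" */
}

/* return the index of the first boundary that is greater than delta */
static int pick_bucket(unsigned long long delta)
{
	int first = 0, last = NUM_BUCKETS - 1, mid;

	if (delta < range[first])
		return 0;
	while (first < last) {
		mid = (first + last) / 2;
		if (delta >= range[mid])
			first = mid + 1;
		else
			last = mid;
	}
	return first;
}

int main(void)
{
	init_default_ranges();
	/* a 1500 ns fault lands in the "< 2400" bucket (index 2) */
	printf("bucket %d\n", pick_bucket(1500));
	return 0;
}

A fault below 600 ns goes to bucket 0; anything at or above the last finite
boundary (38400 ns by default) ends up in the "inf" bucket.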

memory.pgfault_histogram: exports the histogram on a per-memcg basis. The counts
can be reset by echoing "reset" into the file, and the bucket boundaries can be
changed by echoing a list of ascending values. See the examples below.

/proc/sys/vm/pgfault_histogram: a global sysctl that turns recording of the
histogram on or off.
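Writing 1 enables recording and writing 0 disables it (this simply flips the
flag that memcg_histogram_record() checks), for example:

$ echo 1 >/proc/sys/vm/pgfault_histogram
$ echo 0 >/proc/sys/vm/pgfault_histogram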

Functional Test:
Create a memcg with a 10g hard limit, run dd in the background and allocate 8g of
anon pages, then measure the anon page allocation latency.

$ mkdir /dev/cgroup/memory/B
$ echo 10g >/dev/cgroup/memory/B/memory.limit_in_bytes
$ echo $$ >/dev/cgroup/memory/B/tasks
$ dd if=/dev/zero of=/export/hdc3/dd/tf0 bs=1024 count=20971520 &
$ allocate 8g anon pages

$ echo 1 >/proc/sys/vm/pgfault_histogram

$ cat /dev/cgroup/memory/B/memory.pgfault_histogram
pgfault latency histogram (ns):
< 600            2051273
< 1200           40859
< 2400           4004
< 4800           1605
< 9600           170
< 19200          82
< 38400          6
< inf            0

$ echo reset >/dev/cgroup/memory/B/memory.pgfault_histogram
$ cat /dev/cgroup/memory/B/memory.pgfault_histogram
pgfault latency histogram (ns):
< 600            0
< 1200           0
< 2400           0
< 4800           0
< 9600           0
< 19200          0
< 38400          0
< inf            0

$ echo 500 520 540 580 600 1000 5000 >/dev/cgroup/memory/B/memory.pgfault_histogram
$ cat /dev/cgroup/memory/B/memory.pgfault_histogram
pgfault latency histogram (ns):
< 500            50
< 520            151
< 540            3715
< 580            1859812
< 600            202241
< 1000           25394
< 5000           5875
< inf            186

Performance Test:
I ran the PageFaultTest (pft) benchmark to measure the overhead of recording the
histogram. No overhead is observed in either "flt/cpu/s" or "fault/wsec".

$ mkdir /dev/cgroup/memory/A
$ echo 16g >/dev/cgroup/memory/A/memory.limit_in_bytes
$ echo $$ >/dev/cgroup/memory/A/tasks
$ ./pft -m 15g -t 8 -T a

Result:
"fault/wsec"

$ ./ministat no_histogram histogram
x no_histogram
+ histogram
+--------------------------------------------------------------------------+
   N           Min           Max        Median           Avg        Stddev
x   5     813404.51     824574.98      821661.3     820470.83     4202.0758
+   5     821228.91     825894.66     822874.65     823374.15     1787.9355

"flt/cpu/s"

$ ./ministat no_histogram histogram
x no_histogram
+ histogram
+--------------------------------------------------------------------------+
   N           Min           Max        Median           Avg        Stddev
x   5     104951.93     106173.13     105142.73      105349.2     513.78158
+   5     104697.67      105416.1     104943.52     104973.77     269.24781
No difference proven at 95.0% confidence

Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
---
 arch/x86/mm/fault.c        |    8 +++
 include/linux/memcontrol.h |    8 +++
 kernel/sysctl.c            |    7 +++
 mm/memcontrol.c            |  128 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 151 insertions(+), 0 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 20e3f87..d7a1490 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -12,6 +12,7 @@
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
 #include <linux/perf_event.h>		/* perf_sw_event		*/
 #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
+#include <linux/memcontrol.h>
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
@@ -966,6 +967,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	int write = error_code & PF_WRITE;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
 					(write ? FAULT_FLAG_WRITE : 0);
+	unsigned long long start, delta;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -1125,6 +1127,7 @@ good_area:
 		return;
 	}
 
+	start = sched_clock();
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
@@ -1132,6 +1135,11 @@ good_area:
 	 */
 	fault = handle_mm_fault(mm, vma, address, flags);
 
+	delta = sched_clock() - start;
+	if (unlikely((long long)delta < 0))
+		delta = 0;
+	memcg_histogram_record(current, delta);
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		mm_fault_error(regs, error_code, address, fault);
 		return;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 29a945a..c7e6cb8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -92,6 +92,8 @@ struct mem_cgroup *mem_cgroup_get_shrink_target(void);
 void mem_cgroup_put_shrink_target(struct mem_cgroup *mem);
 wait_queue_head_t *mem_cgroup_kswapd_waitq(void);
 
+extern void memcg_histogram_record(struct task_struct *tsk, u64 delta);
+
 static inline
 int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
 {
@@ -131,6 +133,8 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 extern int do_swap_account;
 #endif
 
+extern unsigned int sysctl_pgfault_histogram;
+
 static inline bool mem_cgroup_disabled(void)
 {
 	if (mem_cgroup_subsys.disabled)
@@ -476,6 +480,10 @@ wait_queue_head_t *mem_cgroup_kswapd_waitq(void)
 	return NULL;
 }
 
+static inline
+void memcg_histogram_record(struct task_struct *tsk, u64 delta)
+{
+}
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 927fc5a..0dd2939 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1132,6 +1132,13 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &one,
 		.extra2		= &three,
 	},
+	{
+		.procname	= "pgfault_histogram",
+		.data		= &sysctl_pgfault_histogram,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0666,
+		.proc_handler	= proc_dointvec,
+	},
 #ifdef CONFIG_COMPACTION
 	{
 		.procname	= "compact_memory",
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a98471b..c795f96 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,6 +51,7 @@
 #include "internal.h"
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/ctype.h>
 
 #include <asm/uaccess.h>
 
@@ -207,6 +208,13 @@ struct mem_cgroup_eventfd_list {
 static void mem_cgroup_threshold(struct mem_cgroup *mem);
 static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
 
+#define MEMCG_NUM_HISTO_BUCKETS		8
+unsigned int sysctl_pgfault_histogram;
+
+struct memcg_histo {
+	u64 count[MEMCG_NUM_HISTO_BUCKETS];
+};
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -299,6 +307,9 @@ struct mem_cgroup {
 	 * last node we reclaimed from
 	 */
 	int last_scanned_node;
+
+	struct memcg_histo *memcg_histo;
+	u64 memcg_histo_range[MEMCG_NUM_HISTO_BUCKETS];
 };
 
 /* Stuffs for move charges at task migration. */
@@ -4692,6 +4703,105 @@ static int __init memcg_kswapd_init(void)
 }
 module_init(memcg_kswapd_init);
 
+static int mem_cgroup_histogram_seq_read(struct cgroup *cgrp,
+					struct cftype *cft, struct seq_file *m)
+{
+	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cgrp);
+	int i, cpu;
+
+	seq_printf(m, "pgfault latency histogram (ns):\n");
+
+	for (i = 0; i < MEMCG_NUM_HISTO_BUCKETS; i++) {
+		u64 sum = 0;
+
+		for_each_present_cpu(cpu) {
+			struct memcg_histo *histo;
+			histo = per_cpu_ptr(mem_cont->memcg_histo, cpu);
+			sum += histo->count[i];
+		}
+
+		if (i < MEMCG_NUM_HISTO_BUCKETS - 1)
+			seq_printf(m, "< %-15llu",
+					mem_cont->memcg_histo_range[i]);
+		else
+			seq_printf(m, "< %-15s", "inf");
+		seq_printf(m, "%llu\n", sum);
+	}
+
+	return 0;
+}
+
+static int mem_cgroup_histogram_seq_write(struct cgroup *cgrp,
+					struct cftype *cft, const char *buffer)
+{
+	int i;
+	u64 data[MEMCG_NUM_HISTO_BUCKETS];
+	char *end;
+	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cgrp);
+
+	if (!memcmp(buffer, "reset", 5)) {
+		for_each_present_cpu(i) {
+			struct memcg_histo *histo;
+
+			histo = per_cpu_ptr(mem_cont->memcg_histo, i);
+			memset(histo, 0, sizeof(*histo));
+		}
+		goto out;
+	}
+
+	for (i = 0; i < MEMCG_NUM_HISTO_BUCKETS - 1; i++, buffer = end) {
+		while ((isspace(*buffer)))
+			buffer++;
+		data[i] = simple_strtoull(buffer, &end, 10);
+	}
+	data[i] = ULLONG_MAX;
+
+	for (i = 1; i < MEMCG_NUM_HISTO_BUCKETS; i++)
+		if (data[i] < data[i - 1])
+			return -EINVAL;
+
+	memcpy(mem_cont->memcg_histo_range, data, sizeof(data));
+	for_each_present_cpu(i) {
+		struct memcg_histo *histo;
+		histo = per_cpu_ptr(mem_cont->memcg_histo, i);
+		memset(histo->count, 0, sizeof(histo->count));
+	}
+out:
+	return 0;
+}
+
+/*
+ * Record values into histogram buckets
+ */
+void memcg_histogram_record(struct task_struct *tsk, u64 delta)
+{
+	u64 *base;
+	int index, first, last;
+	struct memcg_histo *histo;
+	struct mem_cgroup *mem = mem_cgroup_from_task(tsk);
+
+	if (sysctl_pgfault_histogram == 0)
+		return;
+
+	first = 0;
+	last = MEMCG_NUM_HISTO_BUCKETS - 1;
+	base = mem->memcg_histo_range;
+
+	if (delta >= base[first]) {
+		while (first < last) {
+			index = (first + last) / 2;
+			if (delta >= base[index])
+				first = index + 1;
+			else
+				last = index;
+		}
+	}
+	index = first;
+
+	histo = per_cpu_ptr(mem->memcg_histo, smp_processor_id());
+	histo->count[index]++;
+}
+
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -4769,6 +4879,12 @@ static struct cftype mem_cgroup_files[] = {
 		.name = "reclaim_wmarks",
 		.read_map = mem_cgroup_wmark_read,
 	},
+	{
+		.name = "pgfault_histogram",
+		.read_seq_string = mem_cgroup_histogram_seq_read,
+		.write_string = mem_cgroup_histogram_seq_write,
+		.max_write_len = 256,
+	},
 };
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4903,6 +5019,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
 		free_mem_cgroup_per_zone_info(mem, node);
 
 	free_percpu(mem->stat);
+	free_percpu(mem->memcg_histo);
 	if (sizeof(struct mem_cgroup) < PAGE_SIZE)
 		kfree(mem);
 	else
@@ -5014,6 +5131,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	struct mem_cgroup *mem, *parent;
 	long error = -ENOMEM;
 	int node;
+	int i;
 
 	mem = mem_cgroup_alloc();
 	if (!mem)
@@ -5068,6 +5186,16 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	mutex_init(&mem->thresholds_lock);
 	init_waitqueue_head(&mem->memcg_kswapd_end);
 	INIT_LIST_HEAD(&mem->memcg_kswapd_wait_list);
+
+	mem->memcg_histo = alloc_percpu(typeof(*mem->memcg_histo));
+	if (!mem->memcg_histo)
+		goto free_out;
+
+
+	for (i = 0; i < MEMCG_NUM_HISTO_BUCKETS - 1; i++)
+		mem->memcg_histo_range[i] = (1 << i) * 600ULL;
+	mem->memcg_histo_range[i] = ULLONG_MAX;
+
 	return &mem->css;
 free_out:
 	__mem_cgroup_free(mem);
-- 
1.7.3.1
