[PATCH 3/4] tools/vm/page-types.c: rework on file cache scanning mode

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch reworks the file cache scanning mode of the page-types tool:
when page-types is called with -f <filepath>, it scans the pages in the
page cache tree of the specified file via the /proc/kpagecache interface.

The original implementation did mmap/madvise/mincore/pagemap over the
page cache of the target file(s), which considerably disturbs the state
being measured. This patch avoids that by using /proc/kpagecache.
In addition, page-types currently walks recursively when the -f option
specifies a directory, which is more than the tool needs, so drop that to
keep the code compact and maintainable. The same thing can be done more
flexibly from the shell, for example:

  find /tmp | \
      while read f ; do tools/vm/page-types -f $f ; done | \
      grep 0x | tr -s '\t' ' ' | awk '
    {
      label = $4;
      arr[label] = arr[label] + $2;
    }
    END {
      for ( a in arr ) {
        printf("%s %ld\n", a, arr[a]);
      }
    }
  '

This script produces a page status summary of all files under /tmp; its
output looks like this:

  __RUDl________b_____________________ 2   # page count
  __RUDlA_______b_____________________ 4

ChangeLog:
- rebased onto v3.15-rc5 (resolved conflict with Konstantin's patch,
  commit 65a6a4105f "tools/vm/page-types.c: page-cache sniffing feature")

Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
---
 tools/vm/page-types.c | 276 +++++++++++++++++++++-----------------------------
 1 file changed, 114 insertions(+), 162 deletions(-)

diff --git v3.15-rc5.orig/tools/vm/page-types.c v3.15-rc5/tools/vm/page-types.c
index 05654f5e48d5..a0fb55489ea7 100644
--- v3.15-rc5.orig/tools/vm/page-types.c
+++ v3.15-rc5/tools/vm/page-types.c
@@ -30,14 +30,12 @@
 #include <getopt.h>
 #include <limits.h>
 #include <assert.h>
-#include <ftw.h>
-#include <time.h>
 #include <sys/types.h>
 #include <sys/errno.h>
 #include <sys/fcntl.h>
 #include <sys/mount.h>
+#include <sys/stat.h>
 #include <sys/statfs.h>
-#include <sys/mman.h>
 #include "../../include/uapi/linux/magic.h"
 #include "../../include/uapi/linux/kernel-page-flags.h"
 #include <api/fs/debugfs.h>
@@ -79,6 +77,7 @@
 
 #define KPF_BYTES		8
 #define PROC_KPAGEFLAGS		"/proc/kpageflags"
+#define PROC_KPAGECACHE		"/proc/kpagecache"
 
 /* [32-] kernel hacking assistances */
 #define KPF_RESERVED		32
@@ -162,7 +161,7 @@ static int		opt_raw;	/* for kernel developers */
 static int		opt_list;	/* list pages (in ranges) */
 static int		opt_no_summary;	/* don't show summary */
 static pid_t		opt_pid;	/* process to walk */
-const char *		opt_file;
+static char		*opt_file;	/* walk over pagecache of file */
 
 #define MAX_ADDR_RANGES	1024
 static int		nr_addr_ranges;
@@ -183,6 +182,7 @@ static int		page_size;
 
 static int		pagemap_fd;
 static int		kpageflags_fd;
+static int		kpagecache_fd;
 
 static int		opt_hwpoison;
 static int		opt_unpoison;
@@ -276,6 +276,13 @@ static unsigned long kpageflags_read(uint64_t *buf,
 	return do_u64_read(kpageflags_fd, PROC_KPAGEFLAGS, buf, index, pages);
 }
 
+static unsigned long kpagecache_read(uint64_t *buf,
+				     unsigned long index,
+				     unsigned long pages)
+{
+	return do_u64_read(kpagecache_fd, PROC_KPAGECACHE, buf, index, pages);
+}
+
 static unsigned long pagemap_read(uint64_t *buf,
 				  unsigned long index,
 				  unsigned long pages)
@@ -338,53 +345,62 @@ static char *page_flag_longname(uint64_t flags)
 	return buf;
 }
 
+#define __NR_PAGECACHE_TAGS	3
+#define KPC_TAGS_BITS	__NR_PAGECACHE_TAGS
+#define KPC_TAGS_OFFSET	(64 - KPC_TAGS_BITS)
+#define KPC_TAGS_MASK	(((1ULL << KPC_TAGS_BITS) - 1) << KPC_TAGS_OFFSET)
+#define KPC_TAGS(entry)	((entry & KPC_TAGS_MASK) >> KPC_TAGS_OFFSET)
 
 /*
  * page list and summary
  */
 
-static void show_page_range(unsigned long voffset, unsigned long offset,
-			    unsigned long size, uint64_t flags)
+static void show_page_range(unsigned long voffset,
+			unsigned long offset, uint64_t flags, uint64_t entry)
 {
 	static uint64_t      flags0;
 	static unsigned long voff;
 	static unsigned long index;
 	static unsigned long count;
+	static uint64_t	     entry0;
 
 	if (flags == flags0 && offset == index + count &&
-	    size && voffset == voff + count) {
-		count += size;
+	    (!opt_pid || voffset == voff + count) &&
+	    (!opt_file || (voffset == voff + count && entry == entry0))) {
+		count++;
 		return;
 	}
 
 	if (count) {
 		if (opt_pid)
-			printf("%lx\t", voff);
-		if (opt_file)
-			printf("%lu\t", voff);
-		printf("%lx\t%lx\t%s\n",
-				index, count, page_flag_name(flags0));
+			printf("%lx\t%lx\t%lx\t%s\n",
+			       voff, index, count, page_flag_name(flags0));
+		else if (opt_file)
+			printf("%lx\t%lx\t%lx\t%llx\t%s\n",
+			       voff, index, count, KPC_TAGS(entry0), page_flag_name(flags0));
+		else
+			printf("%lx\t%lx\t%s\n",
+			       index, count, page_flag_name(flags0));
 	}
 
 	flags0 = flags;
 	index  = offset;
 	voff   = voffset;
-	count  = size;
-}
-
-static void flush_page_range(void)
-{
-	show_page_range(0, 0, 0, 0);
+	count  = 1;
+	entry0 = entry;
 }
 
 static void show_page(unsigned long voffset,
-		      unsigned long offset, uint64_t flags)
+		      unsigned long offset, uint64_t flags, uint64_t entry)
 {
 	if (opt_pid)
-		printf("%lx\t", voffset);
-	if (opt_file)
-		printf("%lu\t", voffset);
-	printf("%lx\t%s\n", offset, page_flag_name(flags));
+		printf("%lx\t%lx\t%s\n",
+		       voffset, offset, page_flag_name(flags));
+	else if (opt_file)
+		printf("%lx\t%lx\t%llx\t%s\n",
+		       voffset, offset, KPC_TAGS(entry), page_flag_name(flags));
+	else
+		printf("%lx\t%s\n", offset, page_flag_name(flags));
 }
 
 static void show_summary(void)
@@ -574,9 +590,9 @@ static void add_page(unsigned long voffset,
 		unpoison_page(offset);
 
 	if (opt_list == 1)
-		show_page_range(voffset, offset, 1, flags);
+		show_page_range(voffset, offset, flags, pme);
 	else if (opt_list == 2)
-		show_page(voffset, offset, flags);
+		show_page(voffset, offset, flags, pme);
 
 	nr_pages[hash_slot(flags)]++;
 	total_pages++;
@@ -655,6 +671,40 @@ static void walk_task(unsigned long index, unsigned long count)
 	}
 }
 
+struct stat kpagecache_stat;
+
+#define KPAGECACHE_BATCH	(64 << 10)	/* 64k pages */
+static void walk_file(unsigned long index, unsigned long count)
+{
+	uint64_t buf[KPAGECACHE_BATCH];
+	unsigned long batch;
+	unsigned long pages;
+	unsigned long pfn;
+	unsigned long i;
+	unsigned long end_index = count;
+	unsigned long size;
+
+	stat(opt_file, &kpagecache_stat);
+	size = kpagecache_stat.st_size;
+	if (size > 0)
+		size = (size - 1) / 4096;
+	end_index = min_t(unsigned long, index + count - 1, size);
+	while (index <= end_index) {
+		batch = min_t(unsigned long, count, PAGEMAP_BATCH);
+		pages = kpagecache_read(buf, index, batch);
+		if (pages == 0)
+			break;
+		for (i = 0; i < pages; i++) {
+			pfn = buf[i] & ((1UL << 52) - 1UL);
+			if (pfn)
+				walk_pfn(index + i, pfn, 1, buf[i]);
+		}
+
+		index += pages;
+		count -= pages;
+	}
+}
+
 static void add_addr_range(unsigned long offset, unsigned long size)
 {
 	if (nr_addr_ranges >= MAX_ADDR_RANGES)
@@ -675,10 +725,12 @@ static void walk_addr_ranges(void)
 		add_addr_range(0, ULONG_MAX);
 
 	for (i = 0; i < nr_addr_ranges; i++)
-		if (!opt_pid)
-			walk_pfn(opt_offset[i], opt_offset[i], opt_size[i], 0);
-		else
+		if (opt_pid)
 			walk_task(opt_offset[i], opt_size[i]);
+		else if (opt_file)
+			walk_file(opt_offset[i], opt_size[i]);
+		else
+			walk_pfn(0, opt_offset[i], opt_size[i], 0);
 
 	close(kpageflags_fd);
 }
@@ -806,130 +858,21 @@ static void parse_pid(const char *str)
 	fclose(file);
 }
 
-static void show_file(const char *name, const struct stat *st)
-{
-	unsigned long long size = st->st_size;
-	char atime[64], mtime[64];
-	long now = time(NULL);
-
-	printf("%s\tInode: %u\tSize: %llu (%llu pages)\n",
-			name, (unsigned)st->st_ino,
-			size, (size + page_size - 1) / page_size);
-
-	strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime));
-	strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime));
-
-	printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n",
-			mtime, now - st->st_mtime,
-			atime, now - st->st_atime);
-}
-
-static void walk_file(const char *name, const struct stat *st)
-{
-	uint8_t vec[PAGEMAP_BATCH];
-	uint64_t buf[PAGEMAP_BATCH], flags;
-	unsigned long nr_pages, pfn, i;
-	int fd;
-	off_t off;
-	ssize_t len;
-	void *ptr;
-	int first = 1;
-
-	fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW);
-
-	for (off = 0; off < st->st_size; off += len) {
-		nr_pages = (st->st_size - off + page_size - 1) / page_size;
-		if (nr_pages > PAGEMAP_BATCH)
-			nr_pages = PAGEMAP_BATCH;
-		len = nr_pages * page_size;
-
-		ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
-		if (ptr == MAP_FAILED)
-			fatal("mmap failed: %s", name);
-
-		/* determine cached pages */
-		if (mincore(ptr, len, vec))
-			fatal("mincore failed: %s", name);
-
-		/* turn off readahead */
-		if (madvise(ptr, len, MADV_RANDOM))
-			fatal("madvice failed: %s", name);
-
-		/* populate ptes */
-		for (i = 0; i < nr_pages ; i++) {
-			if (vec[i] & 1)
-				(void)*(volatile int *)(ptr + i * page_size);
-		}
-
-		/* turn off harvesting reference bits */
-		if (madvise(ptr, len, MADV_SEQUENTIAL))
-			fatal("madvice failed: %s", name);
-
-		if (pagemap_read(buf, (unsigned long)ptr / page_size,
-					nr_pages) != nr_pages)
-			fatal("cannot read pagemap");
-
-		munmap(ptr, len);
-
-		for (i = 0; i < nr_pages; i++) {
-			pfn = pagemap_pfn(buf[i]);
-			if (!pfn)
-				continue;
-			if (!kpageflags_read(&flags, pfn, 1))
-				continue;
-			if (first && opt_list) {
-				first = 0;
-				flush_page_range();
-				show_file(name, st);
-			}
-			add_page(off / page_size + i, pfn, flags, buf[i]);
-		}
-	}
-
-	close(fd);
-}
-
-int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f)
-{
-	(void)f;
-	switch (type) {
-	case FTW_F:
-		if (S_ISREG(st->st_mode))
-			walk_file(name, st);
-		break;
-	case FTW_DNR:
-		fprintf(stderr, "cannot read dir: %s\n", name);
-		break;
-	}
-	return 0;
-}
-
-static void walk_page_cache(void)
+static void parse_file(const char *name)
 {
-	struct stat st;
-
-	kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY);
-	pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY);
-
-	if (stat(opt_file, &st))
-		fatal("stat failed: %s\n", opt_file);
-
-	if (S_ISREG(st.st_mode)) {
-		walk_file(opt_file, &st);
-	} else if (S_ISDIR(st.st_mode)) {
-		/* do not follow symlinks and mountpoints */
-		if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0)
-			fatal("nftw failed: %s\n", opt_file);
-	} else
-		fatal("unhandled file type: %s\n", opt_file);
-
-	close(kpageflags_fd);
-	close(pagemap_fd);
+	int ret;
+	opt_file = (char *)name;
+	kpagecache_fd = checked_open(PROC_KPAGECACHE, O_RDWR);
+	ret = write(kpagecache_fd, name, strlen(name));
+	if (ret != (int)strlen(name))
+		fatal("Failed to set file on %s\n", PROC_KPAGECACHE);
 }
 
-static void parse_file(const char *name)
+static void close_kpagecache(void)
 {
-	opt_file = name;
+	/* Reset in-kernel configuration. */
+	write(kpagecache_fd, NULL, 1);
+	close(kpagecache_fd);
 }
 
 static void parse_addr_range(const char *optarg)
@@ -1118,22 +1061,31 @@ int main(int argc, char *argv[])
 		}
 	}
 
-	if (opt_list && opt_pid)
-		printf("voffset\t");
-	if (opt_list && opt_file)
-		printf("foffset\t");
-	if (opt_list == 1)
-		printf("offset\tlen\tflags\n");
-	if (opt_list == 2)
-		printf("offset\tflags\n");
+	if (opt_pid && opt_file) {
+		fprintf(stderr,
+		"Option -p and -f are mutually exclusive. Don't set both.\n");
+		exit(1);
+	}
 
-	if (opt_file)
-		walk_page_cache();
-	else
-		walk_addr_ranges();
+	if (opt_pid) {
+		if (opt_list == 1)
+			printf("voffset\toffset\tlen\tflags\n");
+		if (opt_list == 2)
+			printf("voffset\toffset\tflags\n");
+	} else if (opt_file) {
+		if (opt_list == 1)
+			printf("voffset\toffset\tlen\ttag\tflags\n");
+		if (opt_list == 2)
+			printf("voffset\toffset\ttag\tflags\n");
+	}
+
+	walk_addr_ranges();
 
 	if (opt_list == 1)
-		flush_page_range();
+		show_page_range(0, 0, 0, 0);  /* drain the buffer */
+
+	if (opt_file)
+		close_kpagecache();
 
 	if (opt_no_summary)
 		return 0;
-- 
1.9.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]