Re: [RFC 1/3] /dev/low_mem_notify

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello,

Ok, so here's a proof-of-concept patch that implements sample-based, per-process free-threshold VM event watching using a perf-like syscall ABI. I'd really like to see something like this that's much more extensible and clean than the /dev based ABIs that people have proposed so far.

			Pekka

------------------->

From a07f93fdca360b20daef4a5d66f2a5746f31f6a6 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@xxxxxxxxxx>
Date: Tue, 17 Jan 2012 17:51:48 +0200
Subject: [PATCH] vmnotify: VM event notification system

This patch implements a new sys_vmnotify_fd() system call that returns a
pollable file descriptor that can be used to watch VM events.

For example, to watch for a VM event when free memory is below 99% of available
memory using a 1 second sample period, you'd do something like this:

    struct vmnotify_config config;
    struct vmnotify_event event;
    struct pollfd pollfd;
    int fd;

    config = (struct vmnotify_config) {
            .type                   = VMNOTIFY_TYPE_SAMPLE|VMNOTIFY_TYPE_FREE_THRESHOLD,
            .sample_period_ns       = 1000000000L,
            .free_threshold         = 99,
    };

    fd = sys_vmnotify_fd(&config);

    pollfd.fd               = fd;
    pollfd.events           = POLLIN;

    if (poll(&pollfd, 1, -1) < 0) {
            perror("poll failed");
            exit(1);
    }

    memset(&event, 0, sizeof(event));

    if (read(fd, &event, sizeof(event)) < 0) {
            perror("read failed");
            exit(1);
    }

Signed-off-by: Pekka Enberg <penberg@xxxxxxxxxx>
---
 arch/x86/include/asm/unistd_64.h       |    2 +
 include/linux/vmnotify.h               |   44 ++++++
 mm/Kconfig                             |    6 +
 mm/Makefile                            |    1 +
 mm/vmnotify.c                          |  235 ++++++++++++++++++++++++++++++++
 tools/testing/vmnotify/vmnotify-test.c |   68 +++++++++
 6 files changed, 356 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/vmnotify.h
 create mode 100644 mm/vmnotify.c
 create mode 100644 tools/testing/vmnotify/vmnotify-test.c

diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 0431f19..b0928cd 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -686,6 +686,8 @@ __SYSCALL(__NR_getcpu, sys_getcpu)
 __SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
 #define __NR_process_vm_writev			311
 __SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
+#define __NR_vmnotify_fd			312
+__SYSCALL(__NR_vmnotify_fd, sys_vmnotify_fd)

 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/linux/vmnotify.h b/include/linux/vmnotify.h
new file mode 100644
index 0000000..8f8642b
--- /dev/null
+++ b/include/linux/vmnotify.h
@@ -0,0 +1,44 @@
+#ifndef _LINUX_VMNOTIFY_H
+#define _LINUX_VMNOTIFY_H
+
+#include <linux/types.h>
+
+enum {	/* bits for vmnotify_config.type */
+	VMNOTIFY_TYPE_FREE_THRESHOLD	= 1ULL << 0,	/* match only when free pages are at or below free_threshold percent of available pages */
+	VMNOTIFY_TYPE_SAMPLE		= 1ULL << 1,	/* sample on a periodic timer every sample_period_ns */
+};
+
+struct vmnotify_config {	/* request block copied from user space by sys_vmnotify_fd() */
+	/*
+	 * Size of the struct for ABI extensibility.
+	 */
+	__u32		   size;	/* NOTE(review): not checked by vmnotify_copy_config() in this patch */
+
+	/*
+	 * Notification type bitmask (VMNOTIFY_TYPE_*); must be non-zero
+	 */
+	__u64			type;	/* NOTE(review): __u64 after a lone __u32 implies padding that may differ across ABIs - confirm */
+
+	/*
+	 * Free memory threshold in percentages [1..99]
+	 */
+	__u32			free_threshold;	/* NOTE(review): vmnotify_copy_config() only rejects values above 100 */
+
+	/*
+	 * Sample period in nanoseconds; at least NSEC_PER_MSEC when sampling
+	 */
+	__u64			sample_period_ns;
+};
+
+struct vmnotify_event {	/* record copied to user space by read() on a vmnotify fd */
+	/* Size of the struct for ABI extensibility. */
+	__u32			size;
+
+	__u64			nr_avail_pages;	/* si.totalram as filled in by si_meminfo() */
+
+	__u64			nr_swap_pages;	/* si.totalswap from si_swapinfo(); stays 0 without CONFIG_SWAP */
+
+	__u64			nr_free_pages;	/* global_page_state(NR_FREE_PAGES) */
+};
+
+#endif /* _LINUX_VMNOTIFY_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 011b110..6631167 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -373,3 +373,9 @@ config CLEANCACHE
 	  in a negligible performance hit.

 	  If unsure, say Y to enable cleancache
+
+config VMNOTIFY
+	bool "Enable VM event notification system"
+	default n
+	help
+	  If unsure, say N to disable vmnotify
diff --git a/mm/Makefile b/mm/Makefile
index 50ec00e..e1b5db3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,3 +51,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_VMNOTIFY) += vmnotify.o
diff --git a/mm/vmnotify.c b/mm/vmnotify.c
new file mode 100644
index 0000000..6800450
--- /dev/null
+++ b/mm/vmnotify.c
@@ -0,0 +1,235 @@
+#include <linux/anon_inodes.h>
+#include <linux/vmnotify.h>
+#include <linux/syscalls.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+
+#define VMNOTIFY_MAX_FREE_THRESHOD	100
+
+struct vmnotify_watch {	/* per-fd state, kept in file->private_data */
+	struct vmnotify_config		config;	/* copy of the caller's request */
+
+	struct mutex			mutex;	/* taken around accesses to pending and event */
+	bool				pending;	/* set by a matching sample, cleared by a successful read */
+	struct vmnotify_event		event;	/* most recent matching sample */
+
+	/* sampling */
+	struct hrtimer			timer;	/* initialised only by vmnotify_start_timer() */
+
+	/* poll */
+	wait_queue_head_t		waitq;	/* woken from vmnotify_timer_fn() */
+};
+
+static bool vmnotify_match(struct vmnotify_watch *watch, struct vmnotify_event *event)
+{
+	u64 free_pct;
+
+	/* Watches without a free threshold accept every sample. */
+	if (!(watch->config.type & VMNOTIFY_TYPE_FREE_THRESHOLD))
+		return true;
+
+	/* No available pages reported: nothing to divide by, so no match. */
+	if (!event->nr_avail_pages)
+		return false;
+
+	free_pct = event->nr_free_pages * 100 / event->nr_avail_pages;
+	return free_pct <= watch->config.free_threshold;
+}
+
+static void vmnotify_sample(struct vmnotify_watch *watch)	/* take one sample and publish it if it matches the watch */
+{
+	struct vmnotify_event event;
+	struct sysinfo si;
+
+	memset(&event, 0, sizeof(event));
+
+	event.size		= sizeof(event);
+	event.nr_free_pages	= global_page_state(NR_FREE_PAGES);
+
+	si_meminfo(&si);
+	event.nr_avail_pages	= si.totalram;
+
+#ifdef CONFIG_SWAP
+	si_swapinfo(&si);
+	event.nr_swap_pages	= si.totalswap;	/* NOTE(review): this is total swap, not free swap - confirm intent */
+#endif
+
+	if (!vmnotify_match(watch, &event))	/* non-matching samples are dropped */
+		return;
+
+	mutex_lock(&watch->mutex);	/* NOTE(review): reached from the hrtimer callback; a sleeping lock may not be allowed there - confirm */
+
+	watch->pending = true;
+
+	memcpy(&watch->event, &event, sizeof(event));	/* replaces any event that was not read yet */
+
+	mutex_unlock(&watch->mutex);
+}
+
+static enum hrtimer_restart vmnotify_timer_fn(struct hrtimer *hrtimer)	/* periodic sampling callback installed by vmnotify_start_timer() */
+{
+	struct vmnotify_watch *watch = container_of(hrtimer, struct vmnotify_watch, timer);
+	u64 sample_period = watch->config.sample_period_ns;
+
+	vmnotify_sample(watch);
+
+	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));	/* schedule the next expiry one sample period on */
+
+	wake_up(&watch->waitq);	/* runs on every tick, whether or not the sample matched */
+
+	return HRTIMER_RESTART;	/* keep the timer armed */
+}
+
+static void vmnotify_start_timer(struct vmnotify_watch *watch)
+{
+	struct hrtimer *timer = &watch->timer;
+
+	/* Set up the sampling timer and arm it one sample period from now. */
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	timer->function = vmnotify_timer_fn;
+	hrtimer_start(timer, ns_to_ktime(watch->config.sample_period_ns), HRTIMER_MODE_REL_PINNED);
+}
+
+static unsigned int vmnotify_poll(struct file *file, poll_table *wait)
+{
+	struct vmnotify_watch *w = file->private_data;
+	unsigned int mask;
+
+	/* Hook the caller onto the watch's wait queue. */
+	poll_wait(file, &w->waitq, wait);
+
+	/*
+	 * Report POLLIN exactly when an unread event is pending.
+	 */
+	mutex_lock(&w->mutex);
+	mask = w->pending ? POLLIN : 0;
+	mutex_unlock(&w->mutex);
+	return mask;
+}
+
+static ssize_t vmnotify_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct vmnotify_watch *watch = file->private_data;
+	int ret = 0;
+
+	/* Refuse buffers too small for one event instead of writing past them. */
+	if (count < sizeof(struct vmnotify_event))
+		return -EINVAL;
+
+	mutex_lock(&watch->mutex);
+	if (!watch->pending)	/* nothing to deliver: read() returns 0 */
+		goto out_unlock;
+
+	if (copy_to_user(buf, &watch->event, sizeof(struct vmnotify_event))) {
+		ret = -EFAULT;	/* the event stays pending for a later read */
+		goto out_unlock;
+	}
+
+	ret = watch->event.size;	/* byte count reported to the caller */
+	watch->pending = false;		/* a successful read consumes the event */
+out_unlock:
+	mutex_unlock(&watch->mutex);
+	return ret;
+}
+
+static int vmnotify_release(struct inode *inode, struct file *file)
+{
+	struct vmnotify_watch *watch = file->private_data;
+
+	/* Only sampling watches ever initialise the timer; never cancel a raw one. */
+	if (watch->config.type & VMNOTIFY_TYPE_SAMPLE)
+		hrtimer_cancel(&watch->timer);
+	kfree(watch);	/* the file owned the watch; this is the last reference */
+	return 0;
+}
+
+static const struct file_operations vmnotify_fops = {	/* operations behind the "[vmnotify]" anonymous-inode file */
+	.poll		= vmnotify_poll,
+	.read		= vmnotify_read,
+	.release	= vmnotify_release,	/* stops the timer and frees the watch */
+};
+
+static struct vmnotify_watch *vmnotify_watch_alloc(void)
+{
+	struct vmnotify_watch *w = kzalloc(sizeof *w, GFP_KERNEL);
+
+	/*
+	 * Hand back a zeroed watch with its mutex and wait queue initialised,
+	 * or NULL when the allocation fails.
+	 */
+	if (w) {
+		mutex_init(&w->mutex);
+		init_waitqueue_head(&w->waitq);
+	}
+	return w;
+}
+
+static int vmnotify_copy_config(struct vmnotify_config __user *uconfig,
+				struct vmnotify_config *config)
+{
+	/* Fetch the caller's request; any byte left uncopied is a fault. */
+	if (copy_from_user(config, uconfig, sizeof(struct vmnotify_config)))
+		return -EFAULT;
+
+	/* At least one notification type must be requested. */
+	if (!config->type)
+		return -EINVAL;
+
+	/* Sampling watches need a period of at least one millisecond. */
+	if ((config->type & VMNOTIFY_TYPE_SAMPLE) &&
+	    config->sample_period_ns < NSEC_PER_MSEC)
+		return -EINVAL;
+
+	/* Threshold watches may not exceed the defined maximum percentage. */
+	/* NOTE(review): 0 and 100 pass here although the header says [1..99]. */
+	if ((config->type & VMNOTIFY_TYPE_FREE_THRESHOLD) &&
+	    config->free_threshold > VMNOTIFY_MAX_FREE_THRESHOD)
+		return -EINVAL;
+
+	return 0;
+}
+
+SYSCALL_DEFINE1(vmnotify_fd,
+		struct vmnotify_config __user *, uconfig)
+{
+	struct vmnotify_watch *watch;
+	struct file *file;
+	int err;
+	int fd;
+
+	watch = vmnotify_watch_alloc();
+	if (!watch)
+		return -ENOMEM;
+
+	err = vmnotify_copy_config(uconfig, &watch->config);	/* copies and validates the request */
+	if (err)
+		goto err_free;
+
+	fd = get_unused_fd_flags(O_RDONLY);
+	if (fd < 0) {
+		err = fd;
+		goto err_free;
+	}
+
+	file = anon_inode_getfile("[vmnotify]", &vmnotify_fops, watch, O_RDONLY);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto err_fd;
+	}
+
+	/* Arm the timer before fd_install(): afterwards a racing close() can free watch. */
+	if (watch->config.type & VMNOTIFY_TYPE_SAMPLE)
+		vmnotify_start_timer(watch);
+	fd_install(fd, file);	/* publishes the fd; watch now belongs to the file */
+
+	return fd;
+
+err_fd:
+	put_unused_fd(fd);
+err_free:
+	kfree(watch);
+	return err;
+}
diff --git a/tools/testing/vmnotify/vmnotify-test.c b/tools/testing/vmnotify/vmnotify-test.c
new file mode 100644
index 0000000..3c6b26d
--- /dev/null
+++ b/tools/testing/vmnotify/vmnotify-test.c
@@ -0,0 +1,68 @@
+#include "../../../include/linux/vmnotify.h"
+
+#if defined(__x86_64__)
+#include "../../../arch/x86/include/asm/unistd.h"
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <poll.h>
+
+static int sys_vmnotify_fd(struct vmnotify_config *config)
+{
+	/* Stamp the struct size for the kernel, then issue the raw syscall. */
+	config->size = sizeof(struct vmnotify_config);
+	return syscall(__NR_vmnotify_fd, config);
+}
+
+int main(int argc, char *argv[])
+{
+	struct vmnotify_config config;
+	struct vmnotify_event event;
+	struct pollfd pollfd;
+	int i;
+	int fd;
+
+	config = (struct vmnotify_config) {	/* sample every second, free threshold 99% */
+		.type			= VMNOTIFY_TYPE_SAMPLE|VMNOTIFY_TYPE_FREE_THRESHOLD,
+		.sample_period_ns	= 1000000000L,
+		.free_threshold		= 99,
+	};
+
+	fd = sys_vmnotify_fd(&config);
+	if (fd < 0) {
+		perror("vmnotify_fd failed");
+		exit(1);
+	}
+
+	for (i = 0; i < 10; i++) {	/* ten poll/read rounds, then exit */
+		pollfd.fd		= fd;
+		pollfd.events		= POLLIN;
+
+		if (poll(&pollfd, 1, -1) < 0) {	/* block until the fd is readable */
+			perror("poll failed");
+			exit(1);
+		}
+
+		memset(&event, 0, sizeof(event));
+
+		if (read(fd, &event, sizeof(event)) < 0) {
+			perror("read failed");
+			exit(1);
+		}
+
+		printf("VM event:\n");
+		printf("\tsize=%u\n", event.size);	/* __u32 needs %u, not %lu */
+		printf("\tnr_avail_pages=%llu\n", (unsigned long long) event.nr_avail_pages);
+		printf("\tnr_swap_pages=%llu\n", (unsigned long long) event.nr_swap_pages);
+		printf("\tnr_free_pages=%llu\n", (unsigned long long) event.nr_free_pages);
+	}
+	if (close(fd) < 0) {
+		perror("close failed");
+		exit(1);
+	}
+
+	return 0;
+}
--
1.7.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>


[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]