This is the core of this patch series: add the /dev/mem_notify device, which notifies user processes of low memory. <usage example> fd = open("/dev/mem_notify", O_RDONLY); if (fd < 0) { exit(1); } pollfds.fd = fd; pollfds.events = POLLIN; pollfds.revents = 0; err = poll(&pollfds, 1, -1); // wake up at low memory ... </usage example> ChangeLog v5 -> v6: o improve the formula for the number of tasks to wake up when there are only a few tasks. Signed-off-by: Marcelo Tosatti <marcelo@xxxxxxxxx> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx> --- Documentation/devices.txt | 1 drivers/char/mem.c | 5 + include/linux/mem_notify.h | 42 +++++++++++++++ include/linux/mmzone.h | 1 mm/Makefile | 2 mm/mem_notify.c | 123 +++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 1 7 files changed, 174 insertions(+), 1 deletion(-) Index: b/drivers/char/mem.c =================================================================== --- a/drivers/char/mem.c 2008-02-03 20:59:43.000000000 +0900 +++ b/drivers/char/mem.c 2008-02-03 21:00:24.000000000 +0900 @@ -26,6 +26,7 @@ #include <linux/bootmem.h> #include <linux/splice.h> #include <linux/pfn.h> +#include <linux/mem_notify.h> #include <asm/uaccess.h> #include <asm/io.h> @@ -869,6 +870,9 @@ static int memory_open(struct inode * in filp->f_op = &oldmem_fops; break; #endif + case 13: + filp->f_op = &mem_notify_fops; + break; default: return -ENXIO; } @@ -901,6 +905,7 @@ static const struct { #ifdef CONFIG_CRASH_DUMP {12,"oldmem", S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops}, #endif + {13, "mem_notify", S_IRUGO, &mem_notify_fops}, }; static struct class *mem_class; Index: b/include/linux/mem_notify.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/include/linux/mem_notify.h 2008-02-03 21:01:41.000000000 +0900 @@ -0,0 +1,42 @@ +/* + * Notify applications of memory pressure via /dev/mem_notify + * + * Copyright (C) 2008 Marcelo Tosatti <marcelo@xxxxxxxxx>, + * KOSAKI Motohiro
<kosaki.motohiro@xxxxxxxxxxxxxx> + * + * Released under the GPL, see the file COPYING for details. + */ + +#ifndef _LINUX_MEM_NOTIFY_H +#define _LINUX_MEM_NOTIFY_H + +#define MEM_NOTIFY_FREQ (HZ/5) /* minimum gap, in jiffies, between two pressure notifications */ + +extern atomic_long_t last_mem_notify; /* jiffies value stored by the last pressure notification */ +extern struct file_operations mem_notify_fops; + +extern void __memory_pressure_notify(struct zone *zone, int pressure); +/* Inline filter in front of __memory_pressure_notify(): bails out when the event is rate-limited, when the zone still has plenty of free pages, or when a pressure == 0 call finds no status to clear. */ +static inline void memory_pressure_notify(struct zone *zone, int pressure) +{ + unsigned long target; + unsigned long pages_high, pages_free, pages_reserve; + + if (pressure) { + target = atomic_long_read(&last_mem_notify) + MEM_NOTIFY_FREQ; + if (likely(time_before(jiffies, target))) /* last notification was less than MEM_NOTIFY_FREQ ago */ + return; + + pages_high = zone->pages_high; + pages_free = zone_page_state(zone, NR_FREE_PAGES); + pages_reserve = zone->lowmem_reserve[MAX_NR_ZONES-1]; + if (unlikely(pages_free > (pages_high+pages_reserve)*2)) /* more than twice (pages_high + reserve) is still free */ + return; + + } else if (likely(!zone->mem_notify_status)) /* pressure == 0 and no status recorded for this zone */ + return; + + __memory_pressure_notify(zone, pressure); +} + +#endif /* _LINUX_MEM_NOTIFY_H */ Index: b/include/linux/mmzone.h =================================================================== --- a/include/linux/mmzone.h 2008-02-03 20:59:43.000000000 +0900 +++ b/include/linux/mmzone.h 2008-02-03 20:59:46.000000000 +0900 @@ -283,6 +283,7 @@ struct zone { */ int prev_priority; + int mem_notify_status; ZONE_PADDING(_pad2_) /* Rarely used or read-mostly fields */ Index: b/mm/Makefile =================================================================== --- a/mm/Makefile 2008-02-03 20:59:43.000000000 +0900 +++ b/mm/Makefile 2008-02-03 20:59:46.000000000 +0900 @@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o $(mmu-y) + page_isolation.o mem_notify.o $(mmu-y) obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o Index: b/mm/mem_notify.c
=================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/mm/mem_notify.c 2008-02-03 21:02:30.000000000 +0900 @@ -0,0 +1,123 @@ +/* + * Notify applications of memory pressure via /dev/mem_notify + * + * Copyright (C) 2008 Marcelo Tosatti <marcelo@xxxxxxxxx>, + * KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx> + * + * Released under the GPL, see the file COPYING for details. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/wait.h> +#include <linux/poll.h> +#include <linux/timer.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/vmstat.h> +#include <linux/percpu.h> +#include <linux/slab.h> +#include <linux/mem_notify.h> + +#include <asm/atomic.h> + +#define MAX_PROC_WAKEUP_GUARD (10*HZ) /* upper bound, in jiffies, on the per-file poll guard time */ +#define MAX_WAKEUP_TASKS (100) /* upper bound on waiters woken by one notification */ + +struct mem_notify_file_info { + unsigned long last_proc_notify; /* jiffies when this open file last reported POLLIN */ +}; + +static DECLARE_WAIT_QUEUE_HEAD(mem_wait); +static atomic_long_t nr_under_memory_pressure_zones = ATOMIC_LONG_INIT(0); +static atomic_t nr_watcher_task = ATOMIC_INIT(0); /* number of currently open /dev/mem_notify files */ + +atomic_long_t last_mem_notify = ATOMIC_LONG_INIT(INITIAL_JIFFIES); +/* Record the zone's new pressure status under mem_wait.lock and, when pressure is set, stamp last_mem_notify and wake a bounded number of waiters. */ +void __memory_pressure_notify(struct zone *zone, int pressure) +{ + int nr_wakeup; + unsigned long flags; /* spin_lock_irqsave() requires unsigned long for the saved IRQ state */ + + spin_lock_irqsave(&mem_wait.lock, flags); + + if (pressure != zone->mem_notify_status) { + long val = pressure ?
1 : -1; + atomic_long_add(val, &nr_under_memory_pressure_zones); + zone->mem_notify_status = pressure; + } + + if (pressure) { + int nr_watcher = atomic_read(&nr_watcher_task); + + atomic_long_set(&last_mem_notify, jiffies); + if (!nr_watcher) + goto out; + + nr_wakeup = (nr_watcher >> 4) + 1; /* one waiter per 16 watchers, plus one */ + if (unlikely(nr_wakeup > MAX_WAKEUP_TASKS)) + nr_wakeup = MAX_WAKEUP_TASKS; + + wake_up_locked_nr(&mem_wait, nr_wakeup); + } +out: + spin_unlock_irqrestore(&mem_wait.lock, flags); +} +/* open(): allocate the per-file state and count the opener as a watcher; returns 0 or -ENOMEM. */ +static int mem_notify_open(struct inode *inode, struct file *file) +{ + struct mem_notify_file_info *info; + int err = 0; + + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + err = -ENOMEM; + goto out; + } + + info->last_proc_notify = INITIAL_JIFFIES; + file->private_data = info; + atomic_inc(&nr_watcher_task); +out: + return err; +} +/* release(): free the per-file state and drop the watcher count. */ +static int mem_notify_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + atomic_dec(&nr_watcher_task); + return 0; +} +/* poll(): return POLLIN when at least one zone is under pressure and this file's guard time has elapsed; otherwise return 0. */ +static unsigned int mem_notify_poll(struct file *file, poll_table *wait) +{ + struct mem_notify_file_info *info = file->private_data; + unsigned long now = jiffies; + unsigned long timeout; + unsigned int retval = 0; + unsigned long guard_time; + + poll_wait_exclusive(file, &mem_wait, wait); + + guard_time = min_t(unsigned long, + MEM_NOTIFY_FREQ * atomic_read(&nr_watcher_task), + MAX_PROC_WAKEUP_GUARD); /* guard grows with the watcher count, capped at MAX_PROC_WAKEUP_GUARD */ + timeout = info->last_proc_notify + guard_time; + if (time_before(now, timeout)) + goto out; + + if (atomic_long_read(&nr_under_memory_pressure_zones) != 0) { + info->last_proc_notify = now; + retval = POLLIN; + } + +out: + return retval; +} + +struct file_operations mem_notify_fops = { + .open = mem_notify_open, + .release = mem_notify_release, + .poll = mem_notify_poll, +}; +EXPORT_SYMBOL(mem_notify_fops); Index: b/mm/page_alloc.c =================================================================== --- a/mm/page_alloc.c 2008-02-03 20:59:43.000000000 +0900 +++ b/mm/page_alloc.c 2008-02-03
21:01:43.000000000 +0900 @@ -3458,6 +3458,7 @@ static void __meminit free_area_init_cor zone->zone_pgdat = pgdat; zone->prev_priority = DEF_PRIORITY; + zone->mem_notify_status = 0; zone_pcp_init(zone); INIT_LIST_HEAD(&zone->active_list); Index: b/Documentation/devices.txt =================================================================== --- a/Documentation/devices.txt 2008-02-03 20:59:43.000000000 +0900 +++ b/Documentation/devices.txt 2008-02-03 20:59:46.000000000 +0900 @@ -96,6 +96,7 @@ Your cooperation is appreciated. 11 = /dev/kmsg Writes to this come out as printk's 12 = /dev/oldmem Used by crashdump kernels to access the memory of the kernel that crashed. + 13 = /dev/mem_notify Low memory notification. 1 block RAM disk 0 = /dev/ram0 First RAM disk - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html