Hi Balbir, On Tue, Jan 25, 2011 at 2:10 PM, Balbir Singh <balbir@xxxxxxxxxxxxxxxxxx> wrote: > Changelog v4 > 1. Add max_unmapped_ratio and use that as the upper limit > to check when to shrink the unmapped page cache (Christoph > Lameter) > > Changelog v2 > 1. Use a config option to enable the code (Andrew Morton) > 2. Explain the magic tunables in the code or at-least attempt > to explain them (General comment) > 3. Hint uses of the boot parameter with unlikely (Andrew Morton) > 4. Use better names (balanced is not a good naming convention) > > Provide control using zone_reclaim() and a boot parameter. The > code reuses functionality from zone_reclaim() to isolate unmapped > pages and reclaim them as a priority, ahead of other mapped pages. > A new sysctl for max_unmapped_ratio is provided and set to 16, > indicating 16% of the total zone pages are unmapped, we start > shrinking unmapped page cache. > > Signed-off-by: Balbir Singh <balbir@xxxxxxxxxxxxxxxxxx> > --- > Documentation/kernel-parameters.txt | 8 +++ > include/linux/mmzone.h | 5 ++ > include/linux/swap.h | 23 ++++++++- > init/Kconfig | 12 +++++ > kernel/sysctl.c | 11 ++++ > mm/page_alloc.c | 25 ++++++++++ > mm/vmscan.c | 87 +++++++++++++++++++++++++++++++++++ > 7 files changed, 166 insertions(+), 5 deletions(-) > > diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt > index fee5f57..65a4ee6 100644 > --- a/Documentation/kernel-parameters.txt > +++ b/Documentation/kernel-parameters.txt > @@ -2500,6 +2500,14 @@ and is between 256 and 4096 characters. It is defined in the file > [X86] > Set unknown_nmi_panic=1 early on boot. > > + unmapped_page_control > + [KNL] Available if CONFIG_UNMAPPED_PAGECACHE_CONTROL > + is enabled. It controls the amount of unmapped memory > + that is present in the system. 
This boot option plus > + vm.min_unmapped_ratio (sysctl) provide granular control > + over how much unmapped page cache can exist in the system > + before kswapd starts reclaiming unmapped page cache pages. > + > usbcore.autosuspend= > [USB] The autosuspend time delay (in seconds) used > for newly-detected USB devices (default 2). This > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 2485acc..18f0f09 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -306,7 +306,10 @@ struct zone { > /* > * zone reclaim becomes active if more unmapped pages exist. > */ > +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA) > unsigned long min_unmapped_pages; > + unsigned long max_unmapped_pages; > +#endif > #ifdef CONFIG_NUMA > int node; > unsigned long min_slab_pages; > @@ -773,6 +776,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, > void __user *, size_t *, loff_t *); > int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, > void __user *, size_t *, loff_t *); > +int sysctl_max_unmapped_ratio_sysctl_handler(struct ctl_table *, int, > + void __user *, size_t *, loff_t *); > int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, > void __user *, size_t *, loff_t *); > > diff --git a/include/linux/swap.h b/include/linux/swap.h > index 7b75626..ae62a03 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -255,19 +255,34 @@ extern int vm_swappiness; > extern int remove_mapping(struct address_space *mapping, struct page *page); > extern long vm_total_pages; > > +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA) > extern int sysctl_min_unmapped_ratio; > +extern int sysctl_max_unmapped_ratio; > + > extern int zone_reclaim(struct zone *, gfp_t, unsigned int); > -#ifdef CONFIG_NUMA > -extern int zone_reclaim_mode; > -extern int sysctl_min_slab_ratio; > #else > -#define zone_reclaim_mode 0 > static inline int zone_reclaim(struct zone *z, gfp_t 
mask, unsigned int order) > { > return 0; > } > #endif > > +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) > +extern bool should_reclaim_unmapped_pages(struct zone *zone); > +#else > +static inline bool should_reclaim_unmapped_pages(struct zone *zone) > +{ > + return false; > +} > +#endif > + > +#ifdef CONFIG_NUMA > +extern int zone_reclaim_mode; > +extern int sysctl_min_slab_ratio; > +#else > +#define zone_reclaim_mode 0 > +#endif > + > extern int page_evictable(struct page *page, struct vm_area_struct *vma); > extern void scan_mapping_unevictable_pages(struct address_space *); > > diff --git a/init/Kconfig b/init/Kconfig > index 4f6cdbf..2dfbc09 100644 > --- a/init/Kconfig > +++ b/init/Kconfig > @@ -828,6 +828,18 @@ config SCHED_AUTOGROUP > config MM_OWNER > bool > > +config UNMAPPED_PAGECACHE_CONTROL > + bool "Provide control over unmapped page cache" > + default n > + help > + This option adds support for controlling unmapped page cache > + via a boot parameter (unmapped_page_control). The boot parameter > + with sysctl (vm.min_unmapped_ratio) control the total number > + of unmapped pages in the system. This feature is useful if > + you want to limit the amount of unmapped page cache or want > + to reduce page cache duplication in a virtualized environment. 
> + If unsure say 'N' > + > config SYSFS_DEPRECATED > bool "enable deprecated sysfs features to support old userspace tools" > depends on SYSFS > diff --git a/kernel/sysctl.c b/kernel/sysctl.c > index 12e8f26..63dbba6 100644 > --- a/kernel/sysctl.c > +++ b/kernel/sysctl.c > @@ -1224,6 +1224,7 @@ static struct ctl_table vm_table[] = { > .extra1 = &zero, > }, > #endif > +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA) > { > .procname = "min_unmapped_ratio", > .data = &sysctl_min_unmapped_ratio, > @@ -1233,6 +1234,16 @@ static struct ctl_table vm_table[] = { > .extra1 = &zero, > .extra2 = &one_hundred, > }, > + { > + .procname = "max_unmapped_ratio", > + .data = &sysctl_max_unmapped_ratio, > + .maxlen = sizeof(sysctl_max_unmapped_ratio), > + .mode = 0644, > + .proc_handler = sysctl_max_unmapped_ratio_sysctl_handler, > + .extra1 = &zero, > + .extra2 = &one_hundred, > + }, > +#endif > #ifdef CONFIG_NUMA > { > .procname = "zone_reclaim_mode", > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 7b56473..2ac8549 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -1660,6 +1660,9 @@ zonelist_scan: > unsigned long mark; > int ret; > > + if (should_reclaim_unmapped_pages(zone)) > + wakeup_kswapd(zone, order, classzone_idx); > + Do we really need the check in the fastpath? There are lots of callers of alloc_pages. Many of them are not related to mapped pages. Could we move the check into add_to_page_cache_locked? 
> mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; > if (zone_watermark_ok(zone, order, mark, > classzone_idx, alloc_flags)) > @@ -4167,8 +4170,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, > > zone->spanned_pages = size; > zone->present_pages = realsize; > +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA) > zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) > / 100; > + zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio) > + / 100; > +#endif > #ifdef CONFIG_NUMA > zone->node = nid; > zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; > @@ -5084,6 +5091,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, > return 0; > } > > +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA) > int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, > void __user *buffer, size_t *length, loff_t *ppos) > { > @@ -5100,6 +5108,23 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, > return 0; > } > > +int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write, > + void __user *buffer, size_t *length, loff_t *ppos) > +{ > + struct zone *zone; > + int rc; > + > + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); > + if (rc) > + return rc; > + > + for_each_zone(zone) > + zone->max_unmapped_pages = (zone->present_pages * > + sysctl_max_unmapped_ratio) / 100; > + return 0; > +} > +#endif > + > #ifdef CONFIG_NUMA > int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, > void __user *buffer, size_t *length, loff_t *ppos) > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 02cc82e..6377411 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -159,6 +159,29 @@ static DECLARE_RWSEM(shrinker_rwsem); > #define scanning_global_lru(sc) (1) > #endif > > +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) > +static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone, > + struct 
scan_control *sc); > +static int unmapped_page_control __read_mostly; > + > +static int __init unmapped_page_control_parm(char *str) > +{ > + unmapped_page_control = 1; > + /* > + * XXX: Should we tweak swappiness here? > + */ > + return 1; > +} > +__setup("unmapped_page_control", unmapped_page_control_parm); > + > +#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */ > +static inline unsigned long reclaim_unmapped_pages(int priority, > + struct zone *zone, struct scan_control *sc) > +{ > + return 0; > +} > +#endif > + > static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, > struct scan_control *sc) > { > @@ -2359,6 +2382,12 @@ loop_again: > shrink_active_list(SWAP_CLUSTER_MAX, zone, > &sc, priority, 0); > > + /* > + * We do unmapped page reclaim once here and once > + * below, so that we don't lose out > + */ > + reclaim_unmapped_pages(priority, zone, &sc); > + > if (!zone_watermark_ok_safe(zone, order, > high_wmark_pages(zone), 0, 0)) { > end_zone = i; > @@ -2396,6 +2425,11 @@ loop_again: > continue; > > sc.nr_scanned = 0; > + /* > + * Reclaim unmapped pages upfront, this should be > + * really cheap > + */ > + reclaim_unmapped_pages(priority, zone, &sc); Why should we do this in two phases? It's not a direct reclaim path. I mean it doesn't need to reclaim tightly. If we can't reclaim enough, the next allocation would wake up kswapd again and kswapd would try again. And I have a concern that I already pointed out. If memory pressure is heavy and the number of unmapped pages is above our threshold, reclaim_unmapped_pages can move mapped pages from the tail of the inactive list to its head. That can confuse the LRU order, so the working set can be evicted. zone_reclaim has been used only by NUMA until now, but you are opening it up to everyone. I think it would be a good feature for embedded systems, too. I hope we take care of the working set eviction problem. 
-- Kind regards, Minchan Kim -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html