The patch titled

     readahead: state based method - aging accounting

has been added to the -mm tree.  Its filename is

     readahead-state-based-method-aging-accounting.patch

See http://www.zip.com.au/~akpm/linux/patches/stuff/added-to-mm.txt to find
out what to do about this

------------------------------------------------------
Subject: readahead: state based method - aging accounting
From: Wu Fengguang <wfg@xxxxxxxxxxxxxxxx>

Collect information about the globally available memory and the speed at
which it is consumed.  These data are used by the stateful method to
estimate the thrashing threshold, and they are the decisive factor in the
correctness/accuracy of the resulting read-ahead size.

- On NUMA systems, the accounting is done on a per-node basis.  It works
  for the two common real-world schemes:
  - the reader process allocates cache pages in a node-affine manner;
  - the reader process allocates cache pages evenly across a set of nodes.

- On non-NUMA systems, readahead_aging is mainly increased on the first
  access of read-ahead pages, so that it grows constantly and smoothly.
  This improves accuracy for small/fast read-aheads, at the cost of a
  little more overhead.

Signed-off-by: Wu Fengguang <wfg@xxxxxxxxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxx>
---

 include/linux/mm.h     |    9 +++++++
 include/linux/mmzone.h |    5 +++
 mm/Kconfig             |    5 +++
 mm/readahead.c         |   49 +++++++++++++++++++++++++++++++++++++++
 mm/swap.c              |    2 +
 mm/vmscan.c            |    4 +++
 6 files changed, 74 insertions(+)

diff -puN include/linux/mm.h~readahead-state-based-method-aging-accounting include/linux/mm.h
--- 25/include/linux/mm.h~readahead-state-based-method-aging-accounting	Fri May 26 13:34:08 2006
+++ 25-akpm/include/linux/mm.h	Fri May 26 13:34:08 2006
@@ -1003,6 +1003,15 @@ static inline int prefer_adaptive_readah
 	return readahead_ratio >= 10;
 }
 
+DECLARE_PER_CPU(unsigned long, readahead_aging);
+static inline void inc_readahead_aging(void)
+{
+#ifdef CONFIG_READAHEAD_SMOOTH_AGING
+	if (prefer_adaptive_readahead())
+		per_cpu(readahead_aging, raw_smp_processor_id())++;
+#endif
+}
+
 /* Do stack extension */
 extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
 #ifdef CONFIG_IA64
diff -puN include/linux/mmzone.h~readahead-state-based-method-aging-accounting include/linux/mmzone.h
--- 25/include/linux/mmzone.h~readahead-state-based-method-aging-accounting	Fri May 26 13:34:08 2006
+++ 25-akpm/include/linux/mmzone.h	Fri May 26 13:34:08 2006
@@ -161,6 +161,11 @@ struct zone {
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	int			all_unreclaimable; /* All pages pinned */
 
+	/* The accumulated number of activities that may cause page aging,
+	 * that is, make some pages closer to the tail of inactive_list.
+	 */
+	unsigned long		aging_total;
+
 	/* A count of how many reclaimers are scanning this zone */
 	atomic_t		reclaim_in_progress;
 
diff -puN mm/Kconfig~readahead-state-based-method-aging-accounting mm/Kconfig
--- 25/mm/Kconfig~readahead-state-based-method-aging-accounting	Fri May 26 13:34:08 2006
+++ 25-akpm/mm/Kconfig	Fri May 26 13:34:08 2006
@@ -216,3 +216,8 @@ config UNALIGNED_ZONE_BOUNDARIES
 		echo 1 > /debug/readahead/debug_level # stop filling my kern.log
 
 	  Say N for production servers.
+
+config READAHEAD_SMOOTH_AGING
+	def_bool n if NUMA
+	default y if !NUMA
+	depends on ADAPTIVE_READAHEAD
diff -puN mm/readahead.c~readahead-state-based-method-aging-accounting mm/readahead.c
--- 25/mm/readahead.c~readahead-state-based-method-aging-accounting	Fri May 26 13:34:08 2006
+++ 25-akpm/mm/readahead.c	Fri May 26 13:34:08 2006
@@ -16,6 +16,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <asm/div64.h>
 
 /*
  * Convienent macros for min/max read-ahead pages.
@@ -43,6 +44,14 @@ EXPORT_SYMBOL_GPL(readahead_ratio);
 int readahead_hit_rate = 1;
 
 /*
+ * Measures the aging process of cold pages.
+ * Mainly increased on fresh page references to make it smooth.
+ */
+#ifdef CONFIG_READAHEAD_SMOOTH_AGING
+DEFINE_PER_CPU(unsigned long, readahead_aging);
+#endif
+
+/*
  * Detailed classification of read-ahead behaviors.
  */
 #define RA_CLASS_SHIFT 4
@@ -783,6 +792,46 @@ out:
 }
 
 /*
+ * The node's effective length of inactive_list(s).
+ */
+static unsigned long node_free_and_cold_pages(void)
+{
+	unsigned int i;
+	unsigned long sum = 0;
+	struct zone *zones = NODE_DATA(numa_node_id())->node_zones;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		sum += zones[i].nr_inactive +
+			zones[i].free_pages - zones[i].pages_low;
+
+	return sum;
+}
+
+/*
+ * The node's accumulated aging activities.
+ */
+static unsigned long node_readahead_aging(void)
+{
+	unsigned long sum = 0;
+
+#ifdef CONFIG_READAHEAD_SMOOTH_AGING
+	unsigned long cpu;
+	cpumask_t mask = node_to_cpumask(numa_node_id());
+
+	for_each_cpu_mask(cpu, mask)
+		sum += per_cpu(readahead_aging, cpu);
+#else
+	unsigned int i;
+	struct zone *zones = NODE_DATA(numa_node_id())->node_zones;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		sum += zones[i].aging_total;
+#endif
+
+	return sum;
+}
+
+/*
  * ra_min is mainly determined by the size of cache memory. Reasonable?
  *
  * Table of concrete numbers for 4KB page size:
diff -puN mm/swap.c~readahead-state-based-method-aging-accounting mm/swap.c
--- 25/mm/swap.c~readahead-state-based-method-aging-accounting	Fri May 26 13:34:08 2006
+++ 25-akpm/mm/swap.c	Fri May 26 13:34:08 2006
@@ -127,6 +127,8 @@ void fastcall mark_page_accessed(struct
 		ClearPageReferenced(page);
 	} else if (!PageReferenced(page)) {
 		SetPageReferenced(page);
+		if (PageLRU(page))
+			inc_readahead_aging();
 	}
 }
 
diff -puN mm/vmscan.c~readahead-state-based-method-aging-accounting mm/vmscan.c
--- 25/mm/vmscan.c~readahead-state-based-method-aging-accounting	Fri May 26 13:34:08 2006
+++ 25-akpm/mm/vmscan.c	Fri May 26 13:34:08 2006
@@ -458,6 +458,9 @@ static unsigned long shrink_page_list(st
 		if (PageWriteback(page))
 			goto keep_locked;
 
+		if (!PageReferenced(page))
+			inc_readahead_aging();
+
 		referenced = page_referenced(page, 1);
 		/* In active use or really unfreeable? Activate it.
		 */
		if (referenced && page_mapping_inuse(page))
@@ -656,6 +659,7 @@ static unsigned long shrink_inactive_lis
 					&page_list, &nr_scan);
 		zone->nr_inactive -= nr_taken;
 		zone->pages_scanned += nr_scan;
+		zone->aging_total += nr_scan;
 		spin_unlock_irq(&zone->lru_lock);
 
 		nr_scanned += nr_scan;
_

Patches currently in -mm which might be from wfg@xxxxxxxxxxxxxxxx are

readahead-kconfig-options.patch
radixtree-introduce-__radix_tree_lookup_parent.patch
radixtree-introduce-radix_tree_scan_hole.patch
mm-introduce-probe_pages.patch
mm-introduce-pg_readahead.patch
readahead-add-look-ahead-support-to-__do_page_cache_readahead.patch
readahead-delay-page-release-in-do_generic_mapping_read.patch
readahead-insert-cond_resched-calls.patch
readahead-minmax_ra_pages.patch
readahead-events-accounting.patch
readahead-rescue_pages.patch
readahead-sysctl-parameters.patch
readahead-min-max-sizes.patch
readahead-state-based-method-aging-accounting.patch
readahead-state-based-method-routines.patch
readahead-state-based-method.patch
readahead-context-based-method.patch
readahead-initial-method-guiding-sizes.patch
readahead-initial-method-thrashing-guard-size.patch
readahead-initial-method-expected-read-size.patch
readahead-initial-method-user-recommended-size.patch
readahead-initial-method.patch
readahead-backward-prefetching-method.patch
readahead-seeking-reads-method.patch
readahead-thrashing-recovery-method.patch
readahead-call-scheme.patch
readahead-laptop-mode.patch
readahead-loop-case.patch
readahead-nfsd-case.patch
readahead-turn-on-by-default.patch
readahead-debug-radix-tree-new-functions.patch
readahead-debug-traces-showing-accessed-file-names.patch
readahead-debug-traces-showing-read-patterns.patch
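The helpers added by this patch (node_free_and_cold_pages() and
node_readahead_aging()) only collect counters; the stateful method in a
later patch of the series (readahead-state-based-method.patch) consumes
them.  As a rough illustration of how such counters can bound a read-ahead
size, here is a minimal user-space sketch, assuming a per-stream snapshot
of the aging counter taken at the previous read-ahead.  The struct and
function names are invented for the example, and the formula is a
simplification rather than the actual algorithm used by the patch series.

	#include <stdio.h>

	/* Hypothetical per-stream snapshot taken at the previous read-ahead. */
	struct ra_snapshot {
		unsigned long aging;	/* node_readahead_aging() at last visit */
		unsigned long offset;	/* file offset (in pages) at last visit */
	};

	/*
	 * Estimate how many pages this stream may read ahead before its
	 * oldest pages risk being reclaimed: the stream consumed
	 * (offset - snap->offset) pages while the node aged by
	 * (aging - snap->aging) page slots, and the free/inactive pool
	 * holds free_and_cold pages in total.
	 */
	static unsigned long thrashing_threshold(const struct ra_snapshot *snap,
						 unsigned long aging,
						 unsigned long offset,
						 unsigned long free_and_cold)
	{
		unsigned long stream_pages = offset - snap->offset;
		unsigned long aged_pages = aging - snap->aging;

		if (aged_pages == 0)
			return free_and_cold;	/* no aging observed: whole pool is safe */

		/* pages the stream gets through before the pool is recycled once */
		return (unsigned long)((unsigned long long)stream_pages *
				       free_and_cold / aged_pages);
	}

	int main(void)
	{
		struct ra_snapshot snap = { .aging = 1000, .offset = 0 };

		/* Example numbers: the stream read 256 pages while the node
		 * aged by 4096 page slots, out of a 65536-page pool. */
		printf("threshold ~ %lu pages\n",
		       thrashing_threshold(&snap, 5096, 256, 65536));
		return 0;
	}

With the example numbers above, the estimate is 256 * 65536 / 4096 = 4096
pages: a slow stream on a lightly aging node is allowed a large read-ahead,
while heavy global aging shrinks the safe window.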