From 36c6f4d293469569ca8b53b89ab8eebd358a5fa5 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@xxxxxx>
Date: Mon, 3 Jul 2017 19:02:49 +0100
Subject: [v2] mm: make allocation counters per-order

High-order allocations are obviously more costly, and it's very useful
to know how many of them happen when there are issues with (or
suspicions of) memory fragmentation.

This commit changes the existing per-zone allocation counters to be
per-zone and per-order. These counters are displayed using a new
procfs interface (similar to /proc/buddyinfo):

 $ cat /proc/allocinfo
      DMA          0          0          0          0          0 \
           0          0          0          0          0          0
    DMA32          3          0          1          0          0 \
           0          0          0          0          0          0
   Normal    4997056      23594      10902      23686        931 \
          23        122        786         17          1          0
  Movable          0          0          0          0          0 \
           0          0          0          0          0          0
   Device          0          0          0          0          0 \
           0          0          0          0          0          0

The existing vmstat interface remains untouched* and still shows the
total number of allocated pages, so each high-order allocation is
represented as the corresponding number of order-0 allocations.

 $ cat /proc/vmstat | grep alloc
 pgalloc_dma 0
 pgalloc_dma32 7
 pgalloc_normal 5461660
 pgalloc_movable 0
 pgalloc_device 0

* I've added the Device zone for consistency with the other zones, and
to avoid messy exclusion of this zone in the code.

v2: the functionality can be enabled/disabled by the
PER_ORDER_ALLOC_COUNTERS config option.

Signed-off-by: Roman Gushchin <guro@xxxxxx>
Suggested-by: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Debabrata Banerjee <dbavatar@xxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: kernel-team@xxxxxx
Cc: linux-mm@xxxxxxxxx
Cc: linux-kernel@xxxxxxxxxxxxxxx
---
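[ Note for reviewers, not part of the commit message: the per-order
  counters extend the PGALLOC_* block of the vm_event_item enum in an
  order-major layout: for each order there is a run of MAX_NR_ZONES
  consecutive slots, so the counter for a given (zone, order) pair sits
  at a fixed offset. A minimal sketch of the index computation,
  mirroring __count_alloc_event() below; pgalloc_event_index() is a
  made-up name, for illustration only:

	/*
	 * Illustrative only: map a (zone, order) pair to its vm_event
	 * slot. Block 0 holds the traditional order-0 per-zone
	 * counters; block N holds the order-N counters for all zones.
	 */
	static inline int pgalloc_event_index(enum zone_type zid,
					      unsigned int order)
	{
		return PGALLOC_FIRST_ZONE + order * MAX_NR_ZONES + zid;
	}

  The same layout is what lets /proc/vmstat keep its old format:
  vmstat_start() folds the order > 0 blocks back into the order-0
  counters via sum_alloc_events(), and vmstat_next()/vmstat_show()
  skip the PGALLOC_EVENTS_CUT_SIZE slots that have no vmstat_text
  entry. ]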
 arch/s390/appldata/appldata_mem.c |  16 +++++
 include/linux/mmzone.h            |   2 +
 include/linux/vm_event_item.h     |  27 ++++++--
 include/linux/vmstat.h            |  20 ++++++
 init/Kconfig                      |   9 +++
 mm/page_alloc.c                   |  11 +++-
 mm/vmstat.c                       | 128 +++++++++++++++++++++++++++++++++++---
 7 files changed, 199 insertions(+), 14 deletions(-)

diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index 598df57..79679d3 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -66,6 +66,21 @@ struct appldata_mem_data {
 } __packed;
 
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+static inline void sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
+{
+	int order;
+
+	for (order = 1; order < MAX_ORDER; ++order) {
+		*pgalloc += ev[PGALLOC_NORMAL + order * MAX_NR_ZONES] << order;
+		*pgalloc += ev[PGALLOC_DMA + order * MAX_NR_ZONES] << order;
+	}
+}
+#else
+static inline void sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
+{
+}
+#endif
 
 /*
  * appldata_get_mem_data()
@@ -92,6 +107,7 @@ static void appldata_get_mem_data(void *data)
 	mem_data->pswpout = ev[PSWPOUT];
 	mem_data->pgalloc = ev[PGALLOC_NORMAL];
 	mem_data->pgalloc += ev[PGALLOC_DMA];
+	sum_pgalloc_events(&mem_data->pgalloc, ev);
 	mem_data->pgfault = ev[PGFAULT];
 	mem_data->pgmajfault = ev[PGMAJFAULT];
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fc14b8b..406dfc4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -66,6 +66,8 @@ enum migratetype {
 /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
 extern char * const migratetype_names[MIGRATE_TYPES];
 
+extern const char *zone_name(int idx);
+
 #ifdef CONFIG_CMA
 # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
 # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 37e8d31..da94618 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -19,12 +19,31 @@
 #define HIGHMEM_ZONE(xx)
 #endif
 
-#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE
+#ifdef CONFIG_ZONE_DEVICE
+#define DEVICE_ZONE(xx) xx##_DEVICE,
+#else
+#define DEVICE_ZONE(xx)
+#endif
+
+#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE, DEVICE_ZONE(xx)
+
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+#define PGALLOC_EVENTS_SIZE (MAX_NR_ZONES * MAX_ORDER)
+#define PGALLOC_EVENTS_CUT_SIZE (MAX_NR_ZONES * (MAX_ORDER - 1))
+#define PGALLOC_FIRST_ZONE (PGALLOC_NORMAL - ZONE_NORMAL)
+#else
+#define PGALLOC_EVENTS_SIZE MAX_NR_ZONES
+#define PGALLOC_EVENTS_CUT_SIZE 0
+#define PGALLOC_FIRST_ZONE (PGALLOC_NORMAL - ZONE_NORMAL)
+#endif
 
 enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
-		FOR_ALL_ZONES(PGALLOC),
-		FOR_ALL_ZONES(ALLOCSTALL),
-		FOR_ALL_ZONES(PGSCAN_SKIP),
+		FOR_ALL_ZONES(PGALLOC)
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+		__PGALLOC_LAST = PGALLOC_FIRST_ZONE + PGALLOC_EVENTS_SIZE - 1,
+#endif
+		FOR_ALL_ZONES(ALLOCSTALL)
+		FOR_ALL_ZONES(PGSCAN_SKIP)
 		PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
 		PGFAULT, PGMAJFAULT,
 		PGLAZYFREED,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index b3d85f3..bca96fc 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -103,6 +103,26 @@ static inline void vm_events_fold_cpu(int cpu)
 #define __count_zid_vm_events(item, zid, delta) \
 	__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
 
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+static inline void __count_alloc_event(enum zone_type zid, unsigned int order)
+{
+	enum vm_event_item item;
+
+	if (unlikely(order >= MAX_ORDER)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	item = PGALLOC_FIRST_ZONE + order * MAX_NR_ZONES + zid;
+	__count_vm_events(item, 1);
+}
+#else
+static inline void __count_alloc_event(enum zone_type zid, unsigned int order)
+{
+	__count_zid_vm_events(PGALLOC, zid, 1 << order);
+}
+#endif
+
 /*
  * Zone and node-based page accounting with per cpu differentials.
  */
diff --git a/init/Kconfig b/init/Kconfig
index 8514b25..164d6f0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1477,6 +1477,15 @@ config VM_EVENT_COUNTERS
 	  on EXPERT systems.  /proc/vmstat will only show page counts
 	  if VM event counters are disabled.
 
+config PER_ORDER_ALLOC_COUNTERS
+	bool "Per-order memory allocation counters"
+	depends on VM_EVENT_COUNTERS && PROC_FS
+	help
+	  This option enables splitting per-zone allocation counters
+	  into per-zone per-order counters.
+	  Per-order counters are exported using the /proc/allocinfo
+	  interface, and /proc/vmstat shows accumulated values.
+
 config SLUB_DEBUG
 	default y
 	bool "Enable SLUB debugging support" if EXPERT
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 80e4adb..e74b327 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -233,6 +233,13 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 };
 
+const char *zone_name(int zid)
+{
+	if (zid < MAX_NR_ZONES)
+		return zone_names[zid];
+	return NULL;
+}
+
 char * const migratetype_names[MIGRATE_TYPES] = {
 	"Unmovable",
 	"Movable",
@@ -2779,7 +2786,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 		list = &pcp->lists[migratetype];
 		page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
 		if (page) {
-			__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+			__count_alloc_event(page_zonenum(page), order);
 			zone_statistics(preferred_zone, zone);
 		}
 		local_irq_restore(flags);
@@ -2827,7 +2834,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 	__mod_zone_freepage_state(zone, -(1 << order),
 				  get_pcppage_migratetype(page));
 
-	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+	__count_alloc_event(page_zonenum(page), order);
 	zone_statistics(preferred_zone, zone);
 	local_irq_restore(flags);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a4441b..1d468ed 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -27,6 +27,7 @@
 #include <linux/mm_inline.h>
 #include <linux/page_ext.h>
 #include <linux/page_owner.h>
+#include <linux/mmzone.h>
 
 #include "internal.h"
 
@@ -34,18 +35,18 @@
 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
 EXPORT_PER_CPU_SYMBOL(vm_event_states);
 
-static void sum_vm_events(unsigned long *ret)
+static void sum_vm_events(unsigned long *ret, int off, size_t nr_events)
 {
 	int cpu;
 	int i;
 
-	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
+	memset(ret, 0, nr_events * sizeof(unsigned long));
 
 	for_each_online_cpu(cpu) {
 		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
 
-		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
-			ret[i] += this->event[i];
+		for (i = 0; i < nr_events; i++)
+			ret[i] += this->event[off + i];
 	}
 }
 
@@ -57,7 +58,7 @@ static void sum_vm_events(unsigned long *ret)
 void all_vm_events(unsigned long *ret)
 {
 	get_online_cpus();
-	sum_vm_events(ret);
+	sum_vm_events(ret, 0, NR_VM_EVENT_ITEMS);
 	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(all_vm_events);
@@ -915,8 +916,15 @@ int fragmentation_index(struct zone *zone, unsigned int order)
 #define TEXT_FOR_HIGHMEM(xx)
 #endif
 
+#ifdef CONFIG_ZONE_DEVICE
+#define TEXT_FOR_DEVICE(xx) xx "_device",
+#else
+#define TEXT_FOR_DEVICE(xx)
+#endif
+
 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
-					TEXT_FOR_HIGHMEM(xx) xx "_movable",
+					TEXT_FOR_HIGHMEM(xx) xx "_movable", \
+					TEXT_FOR_DEVICE(xx)
 
 const char * const vmstat_text[] = {
 	/* enum zone_stat_item countes */
@@ -1480,12 +1488,92 @@ enum writeback_stat_item {
 	NR_VM_WRITEBACK_STAT_ITEMS,
 };
 
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+static void sum_alloc_events(unsigned long *v)
+{
+	int zid, order, index;
+
+	for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
+		for (order = 1; order < MAX_ORDER; order++) {
+			index = PGALLOC_FIRST_ZONE + zid;
+			v[index] += v[index + order * MAX_NR_ZONES] << order;
+		}
+	}
+}
+
+static int allocinfo_show(struct seq_file *m, void *arg)
+{
+	unsigned long allocs[PGALLOC_EVENTS_SIZE];
+	unsigned int order;
+	int zid;
+
+	if (arg != SEQ_START_TOKEN)
+		return 0;
+
+	get_online_cpus();
+	sum_vm_events(allocs, PGALLOC_FIRST_ZONE, PGALLOC_EVENTS_SIZE);
+	put_online_cpus();
+
+	for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
+		seq_printf(m, "%8s ", zone_name(zid));
+
+		for (order = 0; order < MAX_ORDER; order++)
+			seq_printf(m, "%10lu ",
+				   allocs[zid + order * MAX_NR_ZONES]);
+
+		seq_putc(m, '\n');
+	}
+
+	return 0;
+}
+
+static void *allocinfo_start(struct seq_file *m, loff_t *pos)
+{
+	if (*pos)
+		return NULL;
+	return SEQ_START_TOKEN;
+}
+
+static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	++*pos;
+	return NULL;
+}
+
+static void allocinfo_stop(struct seq_file *m, void *arg)
+{
+}
+
+static const struct seq_operations allocinfo_op = {
+	.start	= allocinfo_start,
+	.next	= allocinfo_next,
+	.stop	= allocinfo_stop,
+	.show	= allocinfo_show,
+};
+
+static int allocinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &allocinfo_op);
+}
+
+static const struct file_operations allocinfo_file_operations = {
+	.open		= allocinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+#else
+static void sum_alloc_events(unsigned long *v)
+{
+}
+#endif
+
 static void *vmstat_start(struct seq_file *m, loff_t *pos)
 {
 	unsigned long *v;
 	int i, stat_items_size;
 
-	if (*pos >= ARRAY_SIZE(vmstat_text))
+	if (*pos >= ARRAY_SIZE(vmstat_text) + PGALLOC_EVENTS_CUT_SIZE)
 		return NULL;
 
 	stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
 			  NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
@@ -1513,6 +1601,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
 	all_vm_events(v);
+	sum_alloc_events(v);
 	v[PGPGIN] /= 2;		/* sectors -> kbytes */
 	v[PGPGOUT] /= 2;
 #endif
@@ -1521,8 +1610,16 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 
 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
 {
+	int alloc_event_start = NR_VM_ZONE_STAT_ITEMS +
+		NR_VM_NODE_STAT_ITEMS +
+		NR_VM_WRITEBACK_STAT_ITEMS +
+		PGALLOC_FIRST_ZONE;
+
 	(*pos)++;
-	if (*pos >= ARRAY_SIZE(vmstat_text))
+	if (*pos == alloc_event_start + MAX_NR_ZONES)
+		*(pos) += PGALLOC_EVENTS_CUT_SIZE;
+
+	if (*pos >= ARRAY_SIZE(vmstat_text) + PGALLOC_EVENTS_CUT_SIZE)
 		return NULL;
 	return (unsigned long *)m->private + *pos;
 }
@@ -1531,6 +1628,18 @@ static int vmstat_show(struct seq_file *m, void *arg)
 {
 	unsigned long *l = arg;
 	unsigned long off = l - (unsigned long *)m->private;
+	int alloc_event_start = NR_VM_ZONE_STAT_ITEMS +
+		NR_VM_NODE_STAT_ITEMS +
+		NR_VM_WRITEBACK_STAT_ITEMS +
+		PGALLOC_FIRST_ZONE;
+
+	if (off >= alloc_event_start + PGALLOC_EVENTS_SIZE)
+		off -= PGALLOC_EVENTS_CUT_SIZE;
+
+	if (unlikely(off >= ARRAY_SIZE(vmstat_text))) {
+		WARN_ON_ONCE(1);
+		return 0;
+	}
 
 	seq_puts(m, vmstat_text[off]);
 	seq_put_decimal_ull(m, " ", *l);
@@ -1790,6 +1899,9 @@ void __init init_mm_internals(void)
 #endif
 #ifdef CONFIG_PROC_FS
 	proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations);
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+	proc_create("allocinfo", 0444, NULL, &allocinfo_file_operations);
+#endif
 	proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations);
 	proc_create("vmstat", 0444, NULL, &vmstat_file_operations);
 	proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations);
-- 
2.7.4
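P.S. For anyone who wants to play with the new interface, a minimal
userspace sketch (not part of the patch; assumes MAX_ORDER == 11, the
common default with 4K pages, and the /proc/allocinfo layout shown
above) that folds the per-order counters back into a total page count
per zone:

	#include <stdio.h>

	int main(void)
	{
		char zone[16];
		unsigned long long count, pages;
		int order;
		FILE *f = fopen("/proc/allocinfo", "r");

		if (!f) {
			perror("/proc/allocinfo");
			return 1;
		}
		/* each record: zone name followed by MAX_ORDER counters */
		while (fscanf(f, "%15s", zone) == 1) {
			pages = 0;
			for (order = 0; order < 11; order++) {
				if (fscanf(f, "%llu", &count) != 1)
					goto out;
				/* an order-N allocation covers 2^N pages */
				pages += count << order;
			}
			printf("%-8s %llu pages\n", zone, pages);
		}
	out:
		fclose(f);
		return 0;
	}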