per cpu pages structure, aka pcp, has high and batch values to control how many pages we perform caching. This values could be updated asynchronously and updater should ensure that this doesn't make any problem. For this purpose, pageset_update() is implemented and do some memory synchronization. But, it turns out to be wrong when I implemented new feature using this. There is no corresponding smp_rmb() in read-side so that it can't guarantee anything. Without correct updating, system could hang in free_pcppages_bulk() due to larger batch value than high. To properly update this values, we need to synchronization primitives on read-side, but, it hurts allocator's fastpath. There is another choice for synchronization, that is, sending IPI. This is somewhat expensive, but, this is really rare case so I guess it has no problem here. However, reducing IPI is very helpful here. Current logic handles each CPU's pcp update one by one. To reduce sending IPI, we need to re-ogranize the code to handle all CPU's pcp update at one go. This patch implement these requirements. Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx> --- mm/page_alloc.c | 139 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 80 insertions(+), 59 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e6fee4b..3e1e344 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3800,7 +3800,7 @@ static void build_zonelist_cache(pg_data_t *pgdat) * not check if the processor is online before following the pageset pointer. * Other parts of the kernel may not check if the zone is available. */ -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); +static void setup_pageset(struct per_cpu_pageset __percpu *pcp); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static void setup_zone_pageset(struct zone *zone); @@ -3846,9 +3846,9 @@ static int __build_all_zonelists(void *data) * needs the percpu allocator in order to allocate its pagesets * (a chicken-egg dilemma). */ - for_each_possible_cpu(cpu) { - setup_pageset(&per_cpu(boot_pageset, cpu), 0); + setup_pageset(&boot_pageset); + for_each_possible_cpu(cpu) { #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * We now know the "local memory node" for each node-- @@ -4230,24 +4230,59 @@ static int zone_batchsize(struct zone *zone) * outside of boot time (or some other assurance that no concurrent updaters * exist). */ -static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, - unsigned long batch) +static void pageset_update(struct zone *zone, int high, int batch) { - /* start with a fail safe value for batch */ - pcp->batch = 1; - smp_wmb(); + int cpu; + struct per_cpu_pages *pcp; + + /* start with a fail safe value for batch */ + for_each_possible_cpu(cpu) { + pcp = &per_cpu_ptr(zone->pageset, cpu)->pcp; + pcp->batch = 1; + } + kick_all_cpus_sync(); + + /* Update high, then batch, in order */ + for_each_possible_cpu(cpu) { + pcp = &per_cpu_ptr(zone->pageset, cpu)->pcp; + pcp->high = high; + } + kick_all_cpus_sync(); - /* Update high, then batch, in order */ - pcp->high = high; - smp_wmb(); + for_each_possible_cpu(cpu) { + pcp = &per_cpu_ptr(zone->pageset, cpu)->pcp; + pcp->batch = batch; + } +} + +/* + * pageset_get_values_by_high() gets the high water mark for + * hot per_cpu_pagelist to the value high for the pageset p. + */ +static void pageset_get_values_by_high(int input_high, + int *output_high, int *output_batch) +{ + *output_batch = max(1, input_high / 4); + if ((input_high / 4) > (PAGE_SHIFT * 8)) + *output_batch = PAGE_SHIFT * 8; +} - pcp->batch = batch; +/* a companion to pageset_get_values_by_high() */ +static void pageset_get_values_by_batch(int input_batch, + int *output_high, int *output_batch) +{ + *output_high = 6 * input_batch; + *output_batch = max(1, 1 * input_batch); } -/* a companion to pageset_set_high() */ -static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) +static void pageset_get_values(struct zone *zone, int *high, int *batch) { - pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); + if (percpu_pagelist_fraction) { + pageset_get_values_by_high( + (zone->managed_pages / percpu_pagelist_fraction), + high, batch); + } else + pageset_get_values_by_batch(zone_batchsize(zone), high, batch); } static void pageset_init(struct per_cpu_pageset *p) @@ -4263,51 +4298,38 @@ static void pageset_init(struct per_cpu_pageset *p) INIT_LIST_HEAD(&pcp->lists[migratetype]); } -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +/* Use this only in boot time, because it doesn't do any synchronization */ +static void setup_pageset(struct per_cpu_pageset __percpu *pcp) { - pageset_init(p); - pageset_set_batch(p, batch); -} - -/* - * pageset_set_high() sets the high water mark for hot per_cpu_pagelist - * to the value high for the pageset p. - */ -static void pageset_set_high(struct per_cpu_pageset *p, - unsigned long high) -{ - unsigned long batch = max(1UL, high / 4); - if ((high / 4) > (PAGE_SHIFT * 8)) - batch = PAGE_SHIFT * 8; - - pageset_update(&p->pcp, high, batch); -} - -static void pageset_set_high_and_batch(struct zone *zone, - struct per_cpu_pageset *pcp) -{ - if (percpu_pagelist_fraction) - pageset_set_high(pcp, - (zone->managed_pages / - percpu_pagelist_fraction)); - else - pageset_set_batch(pcp, zone_batchsize(zone)); -} + int cpu; + int high, batch; + struct per_cpu_pageset *p; -static void __meminit zone_pageset_init(struct zone *zone, int cpu) -{ - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + pageset_get_values_by_batch(0, &high, &batch); - pageset_init(pcp); - pageset_set_high_and_batch(zone, pcp); + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(pcp, cpu); + pageset_init(p); + p->pcp.high = high; + p->pcp.batch = batch; + } } static void __meminit setup_zone_pageset(struct zone *zone) { int cpu; + int high, batch; + struct per_cpu_pageset *p; + + pageset_get_values(zone, &high, &batch); + zone->pageset = alloc_percpu(struct per_cpu_pageset); - for_each_possible_cpu(cpu) - zone_pageset_init(zone, cpu); + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(zone->pageset, cpu); + pageset_init(p); + p->pcp.high = high; + p->pcp.batch = batch; + } } /* @@ -5928,11 +5950,10 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, goto out; for_each_populated_zone(zone) { - unsigned int cpu; + int high, batch; - for_each_possible_cpu(cpu) - pageset_set_high_and_batch(zone, - per_cpu_ptr(zone->pageset, cpu)); + pageset_get_values(zone, &high, &batch); + pageset_update(zone, high, batch); } out: mutex_unlock(&pcp_batch_high_lock); @@ -6455,11 +6476,11 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) */ void __meminit zone_pcp_update(struct zone *zone) { - unsigned cpu; + int high, batch; + mutex_lock(&pcp_batch_high_lock); - for_each_possible_cpu(cpu) - pageset_set_high_and_batch(zone, - per_cpu_ptr(zone->pageset, cpu)); + pageset_get_values(zone, &high, &batch); + pageset_update(zone, high, batch); mutex_unlock(&pcp_batch_high_lock); } #endif -- 1.7.9.5 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>