On Wed, Oct 31, 2018 at 03:40:02PM +0000, Pasha Tatashin wrote:
> 
> 
> On 10/17/18 7:54 PM, Alexander Duyck wrote:
> > This patch introduces a new iterator for_each_free_mem_pfn_range_in_zone.
> > 
> > This iterator will take care of making sure a given memory range provided
> > is in fact contained within a zone. It takes care of all the bounds checking
> > we were doing in deferred_grow_zone, and deferred_init_memmap. In addition
> > it should help to speed up the search a bit by iterating until the end of a
> > range is greater than the start of the zone pfn range, and will exit
> > completely if the start is beyond the end of the zone.
> > 
> > This patch adds yet another iterator called
> > for_each_free_mem_pfn_range_in_zone_from and then uses it to support
> > initializing and freeing pages in groups no larger than MAX_ORDER_NR_PAGES.
> > By doing this we can greatly improve the cache locality of the pages while
> > we do several loops over them in the init and freeing process.
> > 
> > We are able to tighten the loops as a result since we only really need the
> > checks for first_init_pfn in our first iteration and after that we can
> > assume that all future values will be greater than this. So I have added a
> > function called deferred_init_mem_pfn_range_in_zone that primes the
> > iterators and if it fails we can just exit.
> > 
> > On my x86_64 test system with 384GB of memory per node I saw a reduction in
> > initialization time from 1.85s to 1.38s as a result of this patch.
> > 
> > Signed-off-by: Alexander Duyck <alexander.h.duyck@xxxxxxxxxxxxxxx>

[ ... ]

> > ---
> >  include/linux/memblock.h |   58 +++++++++++++++
> >  mm/memblock.c            |   63 ++++++++++++++++
> >  mm/page_alloc.c          |  176 ++++++++++++++++++++++++++++++++--------------
> >  3 files changed, 242 insertions(+), 55 deletions(-)
> > 
> > diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> > index aee299a6aa76..2ddd1bafdd03 100644
> > --- a/include/linux/memblock.h
> > +++ b/include/linux/memblock.h
> > @@ -178,6 +178,25 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
> >  			       p_start, p_end, p_nid))
> >  
> >  /**
> > + * for_each_mem_range_from - iterate through memblock areas from type_a and not
> > + * included in type_b. Or just type_a if type_b is NULL.
> > + * @i: u64 used as loop variable
> > + * @type_a: ptr to memblock_type to iterate
> > + * @type_b: ptr to memblock_type which excludes from the iteration
> > + * @nid: node selector, %NUMA_NO_NODE for all nodes
> > + * @flags: pick from blocks based on memory attributes
> > + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
> > + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
> > + * @p_nid: ptr to int for nid of the range, can be %NULL
> > + */
> > +#define for_each_mem_range_from(i, type_a, type_b, nid, flags,		\
> > +				p_start, p_end, p_nid)			\
> > +	for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b,	\
> > +				     p_start, p_end, p_nid);		\
> > +	     i != (u64)ULLONG_MAX;					\
> > +	     __next_mem_range(&i, nid, flags, type_a, type_b,		\
> > +			      p_start, p_end, p_nid))
> > +/**
> >   * for_each_mem_range_rev - reverse iterate through memblock areas from
> >   * type_a and not included in type_b. Or just type_a if type_b is NULL.
> >   * @i: u64 used as loop variable
> > @@ -248,6 +267,45 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
> >  	     i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
> >  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
> >  
> > +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT

Sorry for jumping in late, but I've noticed this only now.
Do the new iterators have to be restricted by
CONFIG_DEFERRED_STRUCT_PAGE_INIT?

> > +void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
> > +				  unsigned long *out_spfn,
> > +				  unsigned long *out_epfn);
> > +/**
> > + * for_each_free_mem_pfn_range_in_zone - iterate through zone specific free
> > + * memblock areas
> > + * @i: u64 used as loop variable
> > + * @zone: zone in which all of the memory blocks reside
> > + * @p_start: ptr to ulong for start pfn of the range, can be %NULL
> > + * @p_end: ptr to ulong for end pfn of the range, can be %NULL
> > + *
> > + * Walks over free (memory && !reserved) areas of memblock in a specific
> > + * zone. Available as soon as memblock is initialized.
> > + */
> > +#define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end)	\
> > +	for (i = 0,							\
> > +	     __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end);	\
> > +	     i != (u64)ULLONG_MAX;					\
> > +	     __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
> > +
> > +/**
> > + * for_each_free_mem_pfn_range_in_zone_from - iterate through zone specific
> > + * free memblock areas from a given point
> > + * @i: u64 used as loop variable
> > + * @zone: zone in which all of the memory blocks reside
> > + * @p_start: ptr to ulong for start pfn of the range, can be %NULL
> > + * @p_end: ptr to ulong for end pfn of the range, can be %NULL
> > + *
> > + * Walks over free (memory && !reserved) areas of memblock in a specific
> > + * zone, continuing from current position. Available as soon as memblock is
> > + * initialized.
> > + */
> > +#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
> > +	for (; i != (u64)ULLONG_MAX;					\
> > +	     __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
> > +
> > +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
> > +
> >  /**
> >   * for_each_free_mem_range - iterate through free memblock areas
> >   * @i: u64 used as loop variable
> > diff --git a/mm/memblock.c b/mm/memblock.c
> > index f2ef3915a356..ab3545e356b7 100644
> > --- a/mm/memblock.c
> > +++ b/mm/memblock.c
> > @@ -1239,6 +1239,69 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
> >  	return 0;
> >  }
> >  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
> > +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
> > +/**
> > + * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
> > + *
> > + * @idx: pointer to u64 loop variable
> > + * @zone: zone in which all of the memory blocks reside
> > + * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
> > + * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
> > + *
> > + * This function is meant to be a zone/pfn specific wrapper for the
> > + * for_each_mem_range type iterators. Specifically they are used in the
> > + * deferred memory init routines and as such we were duplicating much of
> > + * this logic throughout the code. So instead of having it in multiple
> > + * locations it seemed like it would make more sense to centralize this to
> > + * one new iterator that does everything they need.
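Also, for anyone else reading along: since this one helper backs both of
the new macros, the intended usage pattern, as I read it, is to prime the
iterator with the plain variant and then resume with the _from variant.
A minimal sketch (hypothetical caller, 'zone' assumed valid and 'i' not
modified in between):

	u64 i;
	unsigned long spfn, epfn;

	/* Prime the iterator: stops at the first free range in the zone. */
	for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn)
		break;

	/* Resume from the saved position; the first pass reuses the
	 * range already stored in spfn/epfn, later passes advance i. */
	for_each_free_mem_pfn_range_in_zone_from(i, zone, &spfn, &epfn) {
		/* consume [spfn, epfn) */
	}

This is exactly the pairing that deferred_init_mem_pfn_range_in_zone()
and deferred_init_maxorder() rely on further down in the patch.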
> > + */
> > +void __init_memblock
> > +__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
> > +			     unsigned long *out_spfn, unsigned long *out_epfn)
> > +{
> > +	int zone_nid = zone_to_nid(zone);
> > +	phys_addr_t spa, epa;
> > +	int nid;
> > +
> > +	__next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
> > +			 &memblock.memory, &memblock.reserved,
> > +			 &spa, &epa, &nid);
> > +
> > +	while (*idx != ULLONG_MAX) {
> > +		unsigned long epfn = PFN_DOWN(epa);
> > +		unsigned long spfn = PFN_UP(spa);
> > +
> > +		/*
> > +		 * Verify the end is at least past the start of the zone and
> > +		 * that we have at least one PFN to initialize.
> > +		 */
> > +		if (zone->zone_start_pfn < epfn && spfn < epfn) {
> > +			/* if we went too far just stop searching */
> > +			if (zone_end_pfn(zone) <= spfn)
> > +				break;
> > +
> > +			if (out_spfn)
> > +				*out_spfn = max(zone->zone_start_pfn, spfn);
> > +			if (out_epfn)
> > +				*out_epfn = min(zone_end_pfn(zone), epfn);
> > +
> > +			return;
> > +		}
> > +
> > +		__next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
> > +				 &memblock.memory, &memblock.reserved,
> > +				 &spa, &epa, &nid);
> > +	}
> > +
> > +	/* signal end of iteration */
> > +	*idx = ULLONG_MAX;
> > +	if (out_spfn)
> > +		*out_spfn = ULONG_MAX;
> > +	if (out_epfn)
> > +		*out_epfn = 0;
> > +}
> > +
> > +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
> >  
> >  #ifdef CONFIG_HAVE_MEMBLOCK_PFN_VALID
> >  unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn)
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index a766a15fad81..20e9eb35d75d 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1512,19 +1512,103 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
> >  	return (nr_pages);
> >  }
> >  
> > +/*
> > + * This function is meant to pre-load the iterator for the zone init.
> > + * Specifically it walks through the ranges until we are caught up to the
> > + * first_init_pfn value and exits there. If we never encounter the value we
> > + * return false indicating there are no valid ranges left.
> > + */
> > +static bool __init
> > +deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
> > +				    unsigned long *spfn, unsigned long *epfn,
> > +				    unsigned long first_init_pfn)
> > +{
> > +	u64 j;
> > +
> > +	/*
> > +	 * Start out by walking through the ranges in this zone that have
> > +	 * already been initialized. We don't need to do anything with them
> > +	 * so we just need to flush them out of the system.
> > +	 */
> > +	for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
> > +		if (*epfn <= first_init_pfn)
> > +			continue;
> > +		if (*spfn < first_init_pfn)
> > +			*spfn = first_init_pfn;
> > +		*i = j;
> > +		return true;
> > +	}
> > +
> > +	return false;
> > +}
> > +
> > +/*
> > + * Initialize and free pages. We do it in two loops: first we initialize
> > + * struct page, then free to buddy allocator, because while we are
> > + * freeing pages we can access pages that are ahead (computing buddy
> > + * page in __free_one_page()).
> > + *
> > + * In order to try and keep some memory in the cache we have the loop
> > + * broken along max page order boundaries. This way we will not cause
> > + * any issues with the buddy page computation.
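The "pages that are ahead" point is the key to why the init and free
passes cannot be fused, so it may be worth spelling out. When a page is
freed, the buddy allocator locates the buddy of a given order by flipping
a single bit of the pfn, and that buddy's pfn can be higher than the one
currently being freed, i.e. its struct page must already have been
initialized. Roughly (an illustration only, mirroring what
__find_buddy_pfn() does):

	/* The order-N buddy of a pfn differs from it only in bit N. */
	static inline unsigned long buddy_of(unsigned long pfn,
					     unsigned int order)
	{
		return pfn ^ (1UL << order);
	}

	/* e.g. freeing pfn 0x1000 at order 3 inspects pfn 0x1008,
	 * which lies ahead of the pfn being freed. */

Keeping both passes within one MAX_ORDER_NR_PAGES block means this
look-ahead stays inside memory that the init pass just touched, which is
where the cache locality win in the commit message comes from.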
> > + */
> > +static unsigned long __init
> > +deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
> > +		       unsigned long *end_pfn)
> > +{
> > +	unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
> > +	unsigned long spfn = *start_pfn, epfn = *end_pfn;
> > +	unsigned long nr_pages = 0;
> > +	u64 j = *i;
> > +
> > +	/* First we loop through and initialize the page values */
> > +	for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
> > +		unsigned long t;
> > +
> > +		if (mo_pfn <= spfn)
> > +			break;
> > +
> > +		t = min(mo_pfn, epfn);
> > +		nr_pages += deferred_init_pages(zone, spfn, t);
> > +
> > +		if (mo_pfn <= epfn)
> > +			break;
> > +	}
> > +
> > +	/* Reset values and now loop through freeing pages as needed */
> > +	j = *i;
> > +
> > +	for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
> > +		unsigned long t;
> > +
> > +		if (mo_pfn <= *start_pfn)
> > +			break;
> > +
> > +		t = min(mo_pfn, *end_pfn);
> > +		deferred_free_pages(*start_pfn, t);
> > +		*start_pfn = t;
> > +
> > +		if (mo_pfn < *end_pfn)
> > +			break;
> > +	}
> > +
> > +	/* Store our current values to be reused on the next iteration */
> > +	*i = j;
> > +
> > +	return nr_pages;
> > +}
> > +
> >  /* Initialise remaining memory on a node */
> >  static int __init deferred_init_memmap(void *data)
> >  {
> >  	pg_data_t *pgdat = data;
> > -	int nid = pgdat->node_id;
> > +	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> > +	unsigned long spfn = 0, epfn = 0, nr_pages = 0;
> > +	unsigned long first_init_pfn, flags;
> >  	unsigned long start = jiffies;
> > -	unsigned long nr_pages = 0;
> > -	unsigned long spfn, epfn, first_init_pfn, flags;
> > -	phys_addr_t spa, epa;
> > -	int zid;
> >  	struct zone *zone;
> > -	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> >  	u64 i;
> > +	int zid;
> >  
> >  	/* Bind memory initialisation thread to a local node if possible */
> >  	if (!cpumask_empty(cpumask))
> > @@ -1549,31 +1633,30 @@ static int __init deferred_init_memmap(void *data)
> >  		if (first_init_pfn < zone_end_pfn(zone))
> >  			break;
> >  	}
> > -	first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
> > +
> > +	/* If the zone is empty somebody else may have cleared out the zone */
> > +	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
> > +						 first_init_pfn)) {
> > +		pgdat_resize_unlock(pgdat, &flags);
> > +		pgdat_init_report_one_done();
> > +		return 0;
> > +	}
> >  
> >  	/*
> > -	 * Initialize and free pages. We do it in two loops: first we initialize
> > -	 * struct page, than free to buddy allocator, because while we are
> > -	 * freeing pages we can access pages that are ahead (computing buddy
> > -	 * page in __free_one_page()).
> > +	 * Initialize and free pages in MAX_ORDER sized increments so
> > +	 * that we can avoid introducing any issues with the buddy
> > +	 * allocator.
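A note on the chunk arithmetic in deferred_init_maxorder() above, since
the "+ 1" is easy to misread: mo_pfn is the first MAX_ORDER boundary
strictly after *start_pfn, so a start that already sits on a boundary
still yields a full chunk rather than an empty one. With a hypothetical
MAX_ORDER_NR_PAGES of 1024 (4MB worth of 4K pages):

	/* ALIGN() rounds up to the next multiple of a power-of-two size:
	 * ALIGN(x, a) == ((x) + (a) - 1) & ~((a) - 1)
	 */
	mo_pfn = ALIGN(1024 + 1, 1024);	/* aligned start   -> 2048 */
	mo_pfn = ALIGN(3000 + 1, 1024);	/* unaligned start -> 3072 */

So each call covers at most one MAX_ORDER-sized block before returning,
and the caller's loop decides whether to keep going.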
> >  	 */
> > -	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
> > -		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
> > -		epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
> > -		nr_pages += deferred_init_pages(zone, spfn, epfn);
> > -	}
> > -	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
> > -		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
> > -		epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
> > -		deferred_free_pages(spfn, epfn);
> > -	}
> > +	while (spfn < epfn)
> > +		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
> > +
> >  	pgdat_resize_unlock(pgdat, &flags);
> >  
> >  	/* Sanity check that the next zone really is unpopulated */
> >  	WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
> >  
> > -	pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
> > -		jiffies_to_msecs(jiffies - start));
> > +	pr_info("node %d initialised, %lu pages in %ums\n",
> > +		pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
> >  
> >  	pgdat_init_report_one_done();
> >  	return 0;
> > @@ -1604,14 +1687,11 @@ static int __init deferred_init_memmap(void *data)
> >  static noinline bool __init
> >  deferred_grow_zone(struct zone *zone, unsigned int order)
> >  {
> > -	int zid = zone_idx(zone);
> > -	int nid = zone_to_nid(zone);
> > -	pg_data_t *pgdat = NODE_DATA(nid);
> >  	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
> > -	unsigned long nr_pages = 0;
> > -	unsigned long first_init_pfn, spfn, epfn, t, flags;
> > +	pg_data_t *pgdat = zone->zone_pgdat;
> >  	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
> > -	phys_addr_t spa, epa;
> > +	unsigned long spfn, epfn, flags;
> > +	unsigned long nr_pages = 0;
> >  	u64 i;
> >  
> >  	/* Only the last zone may have deferred pages */
> > @@ -1640,37 +1720,23 @@ static int __init deferred_init_memmap(void *data)
> >  		return true;
> >  	}
> >  
> > -	first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
> > -
> > -	if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
> > +	/* If the zone is empty somebody else may have cleared out the zone */
> > +	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
> > +						 first_deferred_pfn)) {
> >  		pgdat_resize_unlock(pgdat, &flags);
> > -		return false;
> > +		return true;
> >  	}
> >  
> > -	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
> > -		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
> > -		epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
> > -
> > -		while (spfn < epfn && nr_pages < nr_pages_needed) {
> > -			t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
> > -			first_deferred_pfn = min(t, epfn);
> > -			nr_pages += deferred_init_pages(zone, spfn,
> > -							first_deferred_pfn);
> > -			spfn = first_deferred_pfn;
> > -		}
> > -
> > -		if (nr_pages >= nr_pages_needed)
> > -			break;
> > +	/*
> > +	 * Initialize and free pages in MAX_ORDER sized increments so
> > +	 * that we can avoid introducing any issues with the buddy
> > +	 * allocator.
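To make the termination of the loop below concrete: each call to
deferred_init_maxorder() either advances spfn to the next MAX_ORDER
boundary or moves on past the current free range, so the loop always
makes forward progress. A hypothetical trace, again assuming 1024-pfn
blocks and an order small enough that nr_pages_needed works out to
PAGES_PER_SECTION (32768 with 4K pages on x86_64):

	/* pass 1: spfn 10240 -> 11264, nr_pages =  1024
	 * pass 2: spfn 11264 -> 12288, nr_pages =  2048
	 * ...
	 * stops once nr_pages >= 32768 or spfn >= epfn;
	 * first_deferred_pfn trails spfn so the next grower resumes here.
	 */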
> > +	 */
> > +	while (spfn < epfn && nr_pages < nr_pages_needed) {
> > +		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
> > +		first_deferred_pfn = spfn;
> >  	}
> >  
> > -	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
> > -		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
> > -		epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
> > -		deferred_free_pages(spfn, epfn);
> > -
> > -		if (first_deferred_pfn == epfn)
> > -			break;
> > -	}
> >  	pgdat->first_deferred_pfn = first_deferred_pfn;
> >  	pgdat_resize_unlock(pgdat, &flags);
> > 

-- 
Sincerely yours,
Mike.