To return unused memory to the system, schedule an async depopulation of
percpu chunks.

To balance between scanning too much (and creating overhead due to
pcpu_lock contention) and scanning too little, track the number of chunks
that are worth scanning and mark chunks which are potentially a good
target for depopulation with a new boolean flag. The async depopulation
work clears the flag after trying to depopulate a chunk (successfully or
not).

This commit suggests the following logic: a chunk is a good target for
depopulation if it
  1) has more than 1/4 of its total pages free and populated,
  2) isn't a reserved chunk,
  3) isn't entirely free,
  4) isn't alone in the corresponding slot.
If there are two or more such chunks, an async depopulation is scheduled.

Because chunk population and depopulation are opposite processes which
make little sense together, split the populating part of
pcpu_balance_populated() out into pcpu_grow_populated() and make
pcpu_balance_populated() call into either pcpu_grow_populated() or
pcpu_shrink_populated() conditionally.

Signed-off-by: Roman Gushchin <guro@xxxxxx>
---
 mm/percpu-internal.h |   1 +
 mm/percpu.c          | 111 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 85 insertions(+), 27 deletions(-)

diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 18b768ac7dca..1c5b92af02eb 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -67,6 +67,7 @@ struct pcpu_chunk {
 
 	void			*data;		/* chunk data */
 	bool			immutable;	/* no [de]population allowed */
+	bool			depopulate;	/* depopulation hint */
 	int			start_offset;	/* the overlap with the previous
 						   region to have a page aligned
 						   base_addr */
diff --git a/mm/percpu.c b/mm/percpu.c
index 015d076893f5..148137f0fc0b 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -178,6 +178,12 @@ static LIST_HEAD(pcpu_map_extend_chunks);
  */
 int pcpu_nr_empty_pop_pages;
 
+/*
+ * Track the number of chunks with a lot of free memory.
+ * It's used to release unused pages to the system.
+ */
+static int pcpu_nr_chunks_to_depopulate;
+
 /*
  * The number of populated pages in use by the allocator, protected by
  * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
@@ -1955,6 +1961,11 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
 		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
 			continue;
 
+		if (chunk->depopulate) {
+			chunk->depopulate = false;
+			pcpu_nr_chunks_to_depopulate--;
+		}
+
 		list_move(&chunk->list, &to_free);
 	}
 
@@ -1976,7 +1987,7 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
 }
 
 /**
- * pcpu_balance_populated - manage the amount of populated pages
+ * pcpu_grow_populated - populate chunk(s) to satisfy atomic allocations
  * @type: chunk type
  *
  * Maintain a certain amount of populated pages to satisfy atomic allocations.
@@ -1985,35 +1996,15 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
  * allocation causes the failure as it is possible that requests can be
  * serviced from already backed regions.
  */
-static void pcpu_balance_populated(enum pcpu_chunk_type type)
+static void pcpu_grow_populated(enum pcpu_chunk_type type, int nr_to_pop)
 {
 	/* gfp flags passed to underlying allocators */
 	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
 	struct list_head *pcpu_slot = pcpu_chunk_list(type);
 	struct pcpu_chunk *chunk;
-	int slot, nr_to_pop, ret;
+	int slot, ret;
 
-	/*
-	 * Ensure there are certain number of free populated pages for
-	 * atomic allocs.  Fill up from the most packed so that atomic
-	 * allocs don't increase fragmentation.  If atomic allocation
-	 * failed previously, always populate the maximum amount.  This
-	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
-	 * failing indefinitely; however, large atomic allocs are not
-	 * something we support properly and can be highly unreliable and
-	 * inefficient.
-	 */
 retry_pop:
-	if (pcpu_atomic_alloc_failed) {
-		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
-		/* best effort anyway, don't worry about synchronization */
-		pcpu_atomic_alloc_failed = false;
-	} else {
-		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
-				  pcpu_nr_empty_pop_pages,
-				  0, PCPU_EMPTY_POP_PAGES_HIGH);
-	}
-
 	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
 		unsigned int nr_unpop = 0, rs, re;
 
@@ -2084,9 +2075,18 @@ static void pcpu_shrink_populated(enum pcpu_chunk_type type)
 	list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 		bool isolated = false;
 
-		if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH)
+		if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH ||
+		    pcpu_nr_chunks_to_depopulate < 1)
 			break;
 
+		/*
+		 * Don't try to depopulate a chunk again and again.
+		 */
+		if (!chunk->depopulate)
+			continue;
+		chunk->depopulate = false;
+		pcpu_nr_chunks_to_depopulate--;
+
 		for (i = 0, start = -1; i < chunk->nr_pages; i++) {
 			if (!chunk->nr_empty_pop_pages)
 				break;
@@ -2153,6 +2153,41 @@ static void pcpu_shrink_populated(enum pcpu_chunk_type type)
 	spin_unlock_irq(&pcpu_lock);
 }
 
+/**
+ * pcpu_balance_populated - manage the amount of populated pages
+ * @type: chunk type
+ *
+ * Populate or depopulate chunks to maintain a certain amount
+ * of free pages to satisfy atomic allocations, but not waste
+ * large amounts of memory.
+ */
+static void pcpu_balance_populated(enum pcpu_chunk_type type)
+{
+	int nr_to_pop;
+
+	/*
+	 * Ensure there are certain number of free populated pages for
+	 * atomic allocs.  Fill up from the most packed so that atomic
+	 * allocs don't increase fragmentation.  If atomic allocation
+	 * failed previously, always populate the maximum amount.  This
+	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
+	 * failing indefinitely; however, large atomic allocs are not
+	 * something we support properly and can be highly unreliable and
+	 * inefficient.
+	 */
+	if (pcpu_atomic_alloc_failed) {
+		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
+		/* best effort anyway, don't worry about synchronization */
+		pcpu_atomic_alloc_failed = false;
+		pcpu_grow_populated(type, nr_to_pop);
+	} else if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
+		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH - pcpu_nr_empty_pop_pages;
+		pcpu_grow_populated(type, nr_to_pop);
+	} else if (pcpu_nr_chunks_to_depopulate > 0) {
+		pcpu_shrink_populated(type);
+	}
+}
+
 /**
  * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  * @work: unused
@@ -2188,6 +2223,7 @@ void free_percpu(void __percpu *ptr)
 	int size, off;
 	bool need_balance = false;
 	struct list_head *pcpu_slot;
+	struct pcpu_chunk *pos;
 
 	if (!ptr)
 		return;
@@ -2207,15 +2243,36 @@ void free_percpu(void __percpu *ptr)
 
 	pcpu_memcg_free_hook(chunk, off, size);
 
-	/* if there are more than one fully free chunks, wake up grim reaper */
 	if (chunk->free_bytes == pcpu_unit_size) {
-		struct pcpu_chunk *pos;
-
+		/*
+		 * If there are more than one fully free chunks,
+		 * wake up grim reaper.
+		 */
 		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
 			if (pos != chunk) {
 				need_balance = true;
 				break;
 			}
+
+	} else if (chunk->nr_empty_pop_pages > chunk->nr_pages / 4) {
+		/*
+		 * If there is more than one chunk in the slot and
+		 * at least 1/4 of its pages are empty, mark the chunk
+		 * as a target for the depopulation.  If there is more
+		 * than one chunk like this, schedule an async balancing.
+		 */
+		int nslot = pcpu_chunk_slot(chunk);
+
+		list_for_each_entry(pos, &pcpu_slot[nslot], list)
+			if (pos != chunk && !chunk->depopulate &&
+			    !chunk->immutable) {
+				chunk->depopulate = true;
+				pcpu_nr_chunks_to_depopulate++;
+				break;
+			}
+
+		if (pcpu_nr_chunks_to_depopulate > 1)
+			need_balance = true;
 	}
 
 	trace_percpu_free_percpu(chunk->base_addr, off, ptr);
-- 
2.30.2
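
Not part of the patch, just an illustration: the candidate test described in
the commit message can be read as a single predicate. Below is a minimal
userspace sketch of that heuristic; struct chunk_model, slot_has_other_chunk
and is_depopulation_candidate are made-up names, and only the four conditions
mirror the logic added to free_percpu(). In the patch itself condition 4) is
checked by walking pcpu_slot[pcpu_chunk_slot(chunk)], and condition 3) is
implied by the preceding free_bytes == pcpu_unit_size branch.

/*
 * Illustrative sketch only (not part of the patch): a userspace model of
 * the depopulation-candidate heuristic.  All names here are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

struct chunk_model {
	int nr_pages;		/* total pages backing the chunk */
	int nr_empty_pop_pages;	/* pages that are free but still populated */
	int free_bytes;		/* free bytes in the chunk */
	int unit_size;		/* stands in for pcpu_unit_size */
	bool reserved;		/* reserved chunks are never depopulated */
};

static bool is_depopulation_candidate(const struct chunk_model *c,
				      bool slot_has_other_chunk)
{
	if (c->reserved)
		return false;			/* 2) not a reserved chunk */
	if (c->free_bytes == c->unit_size)
		return false;			/* 3) not entirely free */
	if (!slot_has_other_chunk)
		return false;			/* 4) not alone in its slot */
	/* 1) more than 1/4 of the pages are free and still populated */
	return c->nr_empty_pop_pages > c->nr_pages / 4;
}

int main(void)
{
	/* 3 of 8 pages empty and populated (> 1/4), shares its slot */
	struct chunk_model c = { .nr_pages = 8, .nr_empty_pop_pages = 3,
				 .free_bytes = 1024, .unit_size = 32768 };

	printf("candidate: %d\n", is_depopulation_candidate(&c, true));
	return 0;
}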