Hi Mike, good finds!

I let my test with IPv6 disabled run over the weekend, and after ~4 days
there has been no increase in Percpu memory and MemAvailable has remained
flat - so it seems fairly conclusive that IPv6 was causing the remaining
leak. It will be interesting to see if the fix you found resolves the
issue once IPv6 is re-enabled.

thanks,

Dan McGinnes
IBM Cloud - Containers performance
Int Tel: 247359 Ext Tel: 01962 817359
Notes: Daniel McGinnes/UK/IBM
Email: MCGINNES@xxxxxxxxxx
IBM (UK) Ltd, Hursley Park, Winchester, Hampshire, SO21 2JN


From: "Mike Rapoport" <rppt@xxxxxxxxxxxxxxxxxx>
To: "Roman Gushchin" <guro@xxxxxx>
Cc: Daniel McGinnes/UK/IBM@IBMGB, "cgroups@xxxxxxxxxxxxxxx" <cgroups@xxxxxxxxxxxxxxx>, "Nathaniel Rockwell" <nrockwell@xxxxxxxxxx>
Date: 08/10/2018 08:05
Subject: Re: PROBLEM: Memory leaking when running kubernetes cronjobs

On Sat, Oct 06, 2018 at 12:42:37AM +0000, Roman Gushchin wrote:
> Hi Daniel!
>
> On Fri, Oct 05, 2018 at 10:16:25AM +0000, Daniel McGinnes wrote:
> > Hi Roman,
> >
> > Memory pressure was started after 1 hour (I ran stress --vm 16
> > --vm-bytes 1772864000 -t 300 for 5 minutes, then slept for 5 minutes,
> > in a continuous loop).
> >
> > The machine has 16 cores & 32 GB RAM.
> >
> > I think the issue I still have is that even though the per-cpu memory
> > can be reused for other per-cpu allocations, my understanding is that
> > it will not be available for general use by applications - so if
> > percpu memory usage is growing continuously (which we still see
> > happening, pretty slowly - but over months it would be fairly
> > significant), it means there will be less memory available for
> > applications to use. Please let me know if I've misunderstood
> > something here.
>
> Well, yeah, not looking good.
>
> > After seeing several stacks in IPv6 in the memory leak output, I ran a
> > test with IPv6 disabled on the host. Interestingly, after 24 hours the
> > Percpu memory reported in meminfo seems to have flattened out, whereas
> > with IPv6 enabled it was still growing. MemAvailable is decreasing so
> > slowly that I need to leave it longer to draw any conclusions from it.
>
> Looks like there is an independent per-cpu memory leak somewhere in the
> ipv6 stack. Not sure, of course, but if the number of dying cgroups is
> not growing...

There is a leak in the percpu allocator itself: it never frees some of its
metadata. I sent the fix yesterday [1]; I believe it will be merged in
4.19. Also, there was a recent fix for a leak in ipv6 [2].

I'm now trying to see the dynamics of the percpu allocations, so I've
added yet another debugfs interface for percpu (below), similar to
/proc/vmallocinfo. I hope that by the end of the day I'll be able to see
what is causing the increase in percpu memory.

> I'd try to check that my memleak.py is actually capturing all allocs,
> maybe we're missing something...
>
> Thanks!

[1] https://lkml.org/lkml/2018/10/7/84
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ce7ea4af0838ffd4667ecad4eb5eec7a25342f1e
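As a concrete illustration of the long-running sampling described above - a
minimal userspace sketch (illustrative only, not part of the thread) that
picks the Percpu and MemAvailable lines out of /proc/meminfo so they can be
logged over days; run it from cron or a shell loop and plot the two columns:

/*
 * Sample the Percpu and MemAvailable counters from /proc/meminfo,
 * prefixed with a Unix timestamp. Build with: cc -o pcpuwatch pcpuwatch.c
 */
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		/* print only the two fields being watched for growth */
		if (!strncmp(line, "Percpu:", 7) ||
		    !strncmp(line, "MemAvailable:", 13))
			printf("%ld %s", (long)time(NULL), line);
	}

	fclose(f);
	return 0;
}

Flat Percpu with IPv6 disabled versus steady growth with it enabled is
exactly the signal reported above.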
From 3979853b25b3cd1e0ba0d5242c2145f63afd1ad1 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx>
Date: Sun, 7 Oct 2018 19:41:17 +0300
Subject: [PATCH] percpu: stats: add info about memory users

Print summary of allocation count, size and callers, similar to
/proc/vmallocinfo.

Signed-off-by: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx>
---
 mm/percpu-internal.h | 28 +++++++++++++++---
 mm/percpu-stats.c    | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/percpu.c          | 28 ++++++++++++++----
 3 files changed, 128 insertions(+), 10 deletions(-)

diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index b1739dc..b336ae4 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -21,10 +21,17 @@ struct pcpu_block_md {
 	int	first_free;	/* block position of first free */
 };
 
+struct pcpu_allocs_info {
+	size_t		size;
+	int		count;
+	const void	*caller;
+};
+
 struct pcpu_chunk {
 #ifdef CONFIG_PERCPU_STATS
 	int			nr_alloc;	/* # of allocations */
 	size_t			max_alloc_size; /* largest allocation size */
+	struct pcpu_allocs_info	*allocs;
 #endif
 
 	struct list_head	list;		/* linked to pcpu_slot lists */
@@ -136,8 +143,11 @@ static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
  * CONTEXT:
  * pcpu_lock.
  */
-static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
+static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size,
+					 int off, const void *caller)
 {
+	int idx = off / PCPU_MIN_ALLOC_SIZE;
+
 	lockdep_assert_held(&pcpu_lock);
 
 	pcpu_stats.nr_alloc++;
@@ -151,6 +161,10 @@ static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
 
 	chunk->nr_alloc++;
 	chunk->max_alloc_size = max(chunk->max_alloc_size, size);
+
+	chunk->allocs[idx].size = size;
+	chunk->allocs[idx].caller = caller;
+	chunk->allocs[idx].count = 1;
 }
 
 /*
@@ -160,14 +174,19 @@ static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
  * CONTEXT:
  * pcpu_lock.
  */
-static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
+static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk, int off)
 {
+	int idx = off / PCPU_MIN_ALLOC_SIZE;
+
 	lockdep_assert_held(&pcpu_lock);
 
 	pcpu_stats.nr_dealloc++;
 	pcpu_stats.nr_cur_alloc--;
 
 	chunk->nr_alloc--;
+
+	chunk->allocs[idx].size = 0;
+	chunk->allocs[idx].caller = NULL;
 }
 
 /*
@@ -204,11 +223,12 @@ static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
 {
 }
 
-static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
+static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size,
+					 int off, const void *caller)
 {
 }
 
-static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
+static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk, int off)
 {
 }
 
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index b5fdd43..c587393 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -44,6 +44,34 @@ static int find_max_nr_alloc(void)
 	return max_nr_alloc;
 }
 
+static void get_chunk_alloc_info(struct pcpu_chunk *chunk,
+				 struct pcpu_allocs_info *allocs)
+{
+	int region_bits = pcpu_chunk_map_bits(chunk);
+	int i, j;
+
+	for (i = 0; i < region_bits; i++) {
+		struct pcpu_allocs_info *ai = &chunk->allocs[i];
+
+		if (!ai->size)
+			continue;
+
+		for (j = 0; allocs[j].size; j++) {
+			if (allocs[j].size == ai->size &&
+			    allocs[j].caller == ai->caller) {
+				allocs[j].count++;
+				break;
+			}
+		}
+
+		if (!allocs[j].size) {
+			allocs[j].size = ai->size;
+			allocs[j].caller = ai->caller;
+			allocs[j].count = 1;
+		}
+	}
+}
+
 /*
  * Prints out chunk state. Fragmentation is considered between
  * the beginning of the chunk to the last allocation.
@@ -225,11 +253,65 @@ static int percpu_stats_show(struct seq_file *m, void *v)
 }
 DEFINE_SHOW_ATTRIBUTE(percpu_stats);
 
+static int percpu_users_show(struct seq_file *m, void *v)
+{
+	struct pcpu_allocs_info *allocs, *sum, *cur;
+	struct pcpu_chunk *chunk;
+	int unit_pages, slot_users, max_users;
+	int slot, i;
+
+	unit_pages = pcpu_stats_ai.unit_size >> PAGE_SHIFT;
+	slot_users = pcpu_nr_pages_to_map_bits(unit_pages);
+	max_users = slot_users * (pcpu_nr_slots + 1);
+
+	allocs = vzalloc(array_size(sizeof(*allocs), max_users));
+	if (!allocs)
+		return -ENOMEM;
+
+	spin_lock_irq(&pcpu_lock);
+
+	for (slot = 0; slot < pcpu_nr_slots; slot++)
+		list_for_each_entry(chunk, &pcpu_slot[slot], list)
+			get_chunk_alloc_info(chunk, allocs + slot);
+
+	if (pcpu_reserved_chunk)
+		get_chunk_alloc_info(pcpu_reserved_chunk, allocs + slot);
+
+	spin_unlock_irq(&pcpu_lock);
+
+	for (slot = 1; slot < pcpu_nr_slots + 1; slot++) {
+		for (i = 0; i < slot_users; i++) {
+			sum = &allocs[i];
+			cur = &allocs[i + slot * slot_users];
+
+			if (sum->size == cur->size &&
+			    sum->caller == cur->caller) {
+				sum->count += cur->count;
+				cur->size = cur->count = 0;
+				cur->caller = NULL;
+			}
+		}
+	}
+
+	for (i = 0; i < max_users; i++)
+		if (allocs[i].size)
+			seq_printf(m, "%-10ld\t%-10d\t%-30pS\n", allocs[i].size,
+				   allocs[i].count, allocs[i].caller);
+
+	vfree(allocs);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(percpu_users);
+
 static int __init init_percpu_stats_debugfs(void)
 {
 	debugfs_create_file("percpu_stats", 0444, NULL, NULL,
 			    &percpu_stats_fops);
 
+	debugfs_create_file("percpu_users", 0444, NULL, NULL,
+			    &percpu_users_fops);
+
 	return 0;
 }
diff --git a/mm/percpu.c b/mm/percpu.c
index 25104cd..549b29a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1031,7 +1031,7 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
 	int bit_off, bits, end, oslot;
 
 	lockdep_assert_held(&pcpu_lock);
-	pcpu_stats_area_dealloc(chunk);
+	pcpu_stats_area_dealloc(chunk, off);
 
 	oslot = pcpu_chunk_slot(chunk);
 
@@ -1120,6 +1120,8 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
 					     sizeof(chunk->bound_map[0]), 0);
 	chunk->md_blocks = memblock_alloc(pcpu_chunk_nr_blocks(chunk) *
 					  sizeof(chunk->md_blocks[0]), 0);
+	chunk->allocs = memblock_alloc(region_bits * sizeof(*chunk->allocs), 0);
+
 	pcpu_init_md_blocks(chunk);
 
 	/* manage populated page bitmap */
@@ -1190,6 +1192,15 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
 	if (!chunk->md_blocks)
 		goto md_blocks_fail;
 
+#ifdef CONFIG_PERCPU_STATS
+	chunk->allocs = pcpu_mem_zalloc(region_bits * sizeof(chunk->allocs[0]),
+					gfp);
+	if (!chunk->allocs) {
+		pcpu_mem_free(chunk->md_blocks);
+		goto md_blocks_fail;
+	}
+#endif
+
 	pcpu_init_md_blocks(chunk);
 
 	/* init metadata */
@@ -1212,6 +1223,9 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
 {
 	if (!chunk)
 		return;
+#ifdef CONFIG_PERCPU_STATS
+	pcpu_mem_free(chunk->allocs);
+#endif
 	pcpu_mem_free(chunk->md_blocks);
 	pcpu_mem_free(chunk->bound_map);
 	pcpu_mem_free(chunk->alloc_map);
@@ -1350,7 +1364,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
 static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
-				 gfp_t gfp)
+				 gfp_t gfp, const void *caller)
 {
 	/* whitelisted flags that can be passed to the backing allocators */
 	gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
@@ -1460,7 +1474,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 		goto restart;
 
 area_found:
-	pcpu_stats_area_alloc(chunk, size);
+	pcpu_stats_area_alloc(chunk, size, off, caller);
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 
 	/* populate if not all pages are already there */
@@ -1543,7 +1557,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
  */
 void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
 {
-	return pcpu_alloc(size, align, false, gfp);
+	return pcpu_alloc(size, align, false, gfp, __builtin_return_address(0));
 }
 EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
 
@@ -1556,7 +1570,8 @@ EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
  */
 void __percpu *__alloc_percpu(size_t size, size_t align)
 {
-	return pcpu_alloc(size, align, false, GFP_KERNEL);
+	return pcpu_alloc(size, align, false, GFP_KERNEL,
+			  __builtin_return_address(0));
 }
 EXPORT_SYMBOL_GPL(__alloc_percpu);
 
@@ -1578,7 +1593,8 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  */
 void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 {
-	return pcpu_alloc(size, align, true, GFP_KERNEL);
+	return pcpu_alloc(size, align, true, GFP_KERNEL,
+			  __builtin_return_address(0));
 }
 
 /**
-- 
2.7.4

--
Sincerely yours,
Mike.
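To make the new interface concrete - a hypothetical userspace reader for the
percpu_users file, assuming the patch above is applied, CONFIG_PERCPU_STATS
is enabled, and debugfs is mounted at /sys/kernel/debug. Each output line is
size, count and caller separated by tabs (see the seq_printf() in
percpu_users_show()), so size * count approximates the per-CPU-unit
footprint of each caller; comparing two snapshots taken hours apart shows
which callers keep growing:

/*
 * Illustrative sketch, not from the thread: summarize one snapshot of
 * /sys/kernel/debug/percpu_users. Build with: cc -o pcpuusers pcpuusers.c
 */
#include <stdio.h>

int main(void)
{
	char caller[128];
	long size;
	int count;
	FILE *f = fopen("/sys/kernel/debug/percpu_users", "r");

	if (!f) {
		perror("percpu_users");
		return 1;
	}

	/* each line: <size>\t<count>\t<caller symbol from %pS> */
	while (fscanf(f, "%ld %d %127[^\n]", &size, &count, caller) == 3)
		/* size * count = bytes used in each per-CPU unit by this caller */
		printf("%10ld bytes/unit  %s\n", size * (long)count, caller);

	fclose(f);
	return 0;
}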