Re: [PATCH v3 13/35] lib: add allocation tagging support for memory allocation profiling

On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote:
> Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily
> instrument memory allocators. It registers an "alloc_tags" codetag type
> with /proc/allocinfo interface to output allocation tag information when

Please don't add anything new to the top-level /proc directory. This
should likely live in /sys.

> the feature is enabled.
> CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory
> allocation profiling instrumentation.
> Memory allocation profiling can be enabled or disabled at runtime using
> /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n.
> CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation
> profiling by default.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> Co-developed-by: Kent Overstreet <kent.overstreet@xxxxxxxxx>
> Signed-off-by: Kent Overstreet <kent.overstreet@xxxxxxxxx>
> ---
>  Documentation/admin-guide/sysctl/vm.rst |  16 +++
>  Documentation/filesystems/proc.rst      |  28 +++++
>  include/asm-generic/codetag.lds.h       |  14 +++
>  include/asm-generic/vmlinux.lds.h       |   3 +
>  include/linux/alloc_tag.h               | 133 ++++++++++++++++++++
>  include/linux/sched.h                   |  24 ++++
>  lib/Kconfig.debug                       |  25 ++++
>  lib/Makefile                            |   2 +
>  lib/alloc_tag.c                         | 158 ++++++++++++++++++++++++
>  scripts/module.lds.S                    |   7 ++
>  10 files changed, 410 insertions(+)
>  create mode 100644 include/asm-generic/codetag.lds.h
>  create mode 100644 include/linux/alloc_tag.h
>  create mode 100644 lib/alloc_tag.c
> 
> diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> index c59889de122b..a214719492ea 100644
> --- a/Documentation/admin-guide/sysctl/vm.rst
> +++ b/Documentation/admin-guide/sysctl/vm.rst
> @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm:
>  - legacy_va_layout
>  - lowmem_reserve_ratio
>  - max_map_count
> +- mem_profiling         (only if CONFIG_MEM_ALLOC_PROFILING=y)
>  - memory_failure_early_kill
>  - memory_failure_recovery
>  - min_free_kbytes
> @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation.
>  The default value is 65530.
>  
>  
> +mem_profiling
> +==============
> +
> +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y)
> +
> +1: Enable memory profiling.
> +
> +0: Disable memory profiling.
> +
> +Enabling memory profiling introduces a small performance overhead for all
> +memory allocations.
> +
> +The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.
> +
> +
>  memory_failure_early_kill:
>  ==========================
>  
> diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
> index 104c6d047d9b..40d6d18308e4 100644
> --- a/Documentation/filesystems/proc.rst
> +++ b/Documentation/filesystems/proc.rst
> @@ -688,6 +688,7 @@ files are there, and which are missing.
>   ============ ===============================================================
>   File         Content
>   ============ ===============================================================
> + allocinfo    Memory allocations profiling information
>   apm          Advanced power management info
>   bootconfig   Kernel command line obtained from boot config,
>   	      and, if there were kernel parameters from the
> @@ -953,6 +954,33 @@ also be allocatable although a lot of filesystem metadata may have to be
>  reclaimed to achieve this.
>  
>  
> +allocinfo
> +~~~~~~~
> +
> +Provides information about memory allocations at all locations in the code
> +base. Each allocation in the code is identified by its source file, line
> +number, module and the function calling the allocation. The number of bytes
> +allocated at each location is reported.
> +
> +Example output.
> +
> +::
> +
> +    > cat /proc/allocinfo
> +
> +      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page
> +     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
> +     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
> +     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
> +     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one
> +     1.16MiB     fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc
> +     1.00MiB     mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare
> +      734KiB     fs/xfs/kmem.c:20 module:xfs func:kmem_alloc
> +      640KiB     kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func
> +      640KiB     drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf
> +      ...
> +
> +
>  meminfo
>  ~~~~~~~
>  
> diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
> new file mode 100644
> index 000000000000..64f536b80380
> --- /dev/null
> +++ b/include/asm-generic/codetag.lds.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef __ASM_GENERIC_CODETAG_LDS_H
> +#define __ASM_GENERIC_CODETAG_LDS_H
> +
> +#define SECTION_WITH_BOUNDARIES(_name)	\
> +	. = ALIGN(8);			\
> +	__start_##_name = .;		\
> +	KEEP(*(_name))			\
> +	__stop_##_name = .;
> +
> +#define CODETAG_SECTIONS()		\
> +	SECTION_WITH_BOUNDARIES(alloc_tags)
> +
> +#endif /* __ASM_GENERIC_CODETAG_LDS_H */
> diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> index 5dd3a61d673d..c9997dc50c50 100644
> --- a/include/asm-generic/vmlinux.lds.h
> +++ b/include/asm-generic/vmlinux.lds.h
> @@ -50,6 +50,8 @@
>   *               [__nosave_begin, __nosave_end] for the nosave data
>   */
>  
> +#include <asm-generic/codetag.lds.h>
> +
>  #ifndef LOAD_OFFSET
>  #define LOAD_OFFSET 0
>  #endif
> @@ -366,6 +368,7 @@
>  	. = ALIGN(8);							\
>  	BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes)		\
>  	BOUNDED_SECTION_BY(__dyndbg, ___dyndbg)				\
> +	CODETAG_SECTIONS()						\
>  	LIKELY_PROFILE()		       				\
>  	BRANCH_PROFILE()						\
>  	TRACE_PRINTKS()							\
> diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> new file mode 100644
> index 000000000000..cf55a149fa84
> --- /dev/null
> +++ b/include/linux/alloc_tag.h
> @@ -0,0 +1,133 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * allocation tagging
> + */
> +#ifndef _LINUX_ALLOC_TAG_H
> +#define _LINUX_ALLOC_TAG_H
> +
> +#include <linux/bug.h>
> +#include <linux/codetag.h>
> +#include <linux/container_of.h>
> +#include <linux/preempt.h>
> +#include <asm/percpu.h>
> +#include <linux/cpumask.h>
> +#include <linux/static_key.h>
> +
> +struct alloc_tag_counters {
> +	u64 bytes;
> +	u64 calls;
> +};
> +
> +/*
> + * An instance of this structure is created in a special ELF section at every
> + * allocation callsite. At runtime, the special section is treated as
> + * an array of these. Embedded codetag utilizes codetag framework.
> + */
> +struct alloc_tag {
> +	struct codetag			ct;
> +	struct alloc_tag_counters __percpu	*counters;
> +} __aligned(8);
> +
> +#ifdef CONFIG_MEM_ALLOC_PROFILING
> +
> +static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
> +{
> +	return container_of(ct, struct alloc_tag, ct);
> +}
> +
> +#ifdef ARCH_NEEDS_WEAK_PER_CPU
> +/*
> + * When percpu variables are required to be defined as weak, static percpu
> + * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION).
> + */
> +#error "Memory allocation profiling is incompatible with ARCH_NEEDS_WEAK_PER_CPU"

Is this enforced via Kconfig as well? (Looks like only alpha and s390?)

> +#endif
> +
> +#define DEFINE_ALLOC_TAG(_alloc_tag, _old)					\
> +	static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr);	\
> +	static struct alloc_tag _alloc_tag __used __aligned(8)			\
> +	__section("alloc_tags") = {						\
> +		.ct = CODE_TAG_INIT,						\
> +		.counters = &_alloc_tag_cntr };					\
> +	struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag)
> +
> +DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> +			mem_alloc_profiling_key);
> +
> +static inline bool mem_alloc_profiling_enabled(void)
> +{
> +	return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> +				   &mem_alloc_profiling_key);
> +}
> +
> +static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
> +{
> +	struct alloc_tag_counters v = { 0, 0 };
> +	struct alloc_tag_counters *counter;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		counter = per_cpu_ptr(tag->counters, cpu);
> +		v.bytes += counter->bytes;
> +		v.calls += counter->calls;
> +	}
> +
> +	return v;
> +}
> +
> +static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> +{
> +	struct alloc_tag *tag;
> +
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> +	WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
> +#endif
> +	if (!ref || !ref->ct)
> +		return;
> +
> +	tag = ct_to_alloc_tag(ref->ct);
> +
> +	this_cpu_sub(tag->counters->bytes, bytes);
> +	this_cpu_dec(tag->counters->calls);
> +
> +	ref->ct = NULL;
> +}
> +
> +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> +{
> +	__alloc_tag_sub(ref, bytes);
> +}
> +
> +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes)
> +{
> +	__alloc_tag_sub(ref, bytes);
> +}
> +
> +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
> +{
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> +	WARN_ONCE(ref && ref->ct,
> +		  "alloc_tag was not cleared (got tag for %s:%u)\n",\
> +		  ref->ct->filename, ref->ct->lineno);
> +
> +	WARN_ONCE(!tag, "current->alloc_tag not set");
> +#endif
> +	if (!ref || !tag)
> +		return;
> +
> +	ref->ct = &tag->ct;
> +	this_cpu_add(tag->counters->bytes, bytes);
> +	this_cpu_inc(tag->counters->calls);
> +}
> +
> +#else
> +
> +#define DEFINE_ALLOC_TAG(_alloc_tag, _old)
> +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
> +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {}
> +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
> +				 size_t bytes) {}
> +
> +#endif
> +
> +#endif /* _LINUX_ALLOC_TAG_H */
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index ffe8f618ab86..da68a10517c8 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -770,6 +770,10 @@ struct task_struct {
>  	unsigned int			flags;
>  	unsigned int			ptrace;
>  
> +#ifdef CONFIG_MEM_ALLOC_PROFILING
> +	struct alloc_tag		*alloc_tag;
> +#endif

Normally scheduling is very sensitive to having anything early in
task_struct. I would suggest moving this to the CONFIG_SCHED_CORE ifdef
area.
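
Something along these lines, purely as an illustration (surrounding fields
elided; see the full struct in include/linux/sched.h):

struct task_struct {
	/* ... hot, cache-sensitive fields near the top stay untouched ... */

#ifdef CONFIG_SCHED_CORE
	/* core scheduling fields */
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
	struct alloc_tag		*alloc_tag;
#endif

	/* ... rest of task_struct ... */
};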

> +
>  #ifdef CONFIG_SMP
>  	int				on_cpu;
>  	struct __call_single_node	wake_entry;
> @@ -810,6 +814,7 @@ struct task_struct {
>  	struct task_group		*sched_task_group;
>  #endif
>  
> +
>  #ifdef CONFIG_UCLAMP_TASK
>  	/*
>  	 * Clamp values requested for a scheduling entity.
> @@ -2183,4 +2188,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
>  
>  extern void sched_set_stop_task(int cpu, struct task_struct *stop);
>  
> +#ifdef CONFIG_MEM_ALLOC_PROFILING
> +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
> +{
> +	swap(current->alloc_tag, tag);
> +	return tag;
> +}
> +
> +static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
> +{
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> +	WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
> +#endif
> +	current->alloc_tag = old;
> +}
> +#else
> +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; }
> +#define alloc_tag_restore(_tag, _old)
> +#endif
> +
>  #endif
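
For illustration, roughly how alloc_tag_save()/alloc_tag_restore() end up
pairing around an allocation call (untested sketch; the wrapper name is
invented here, and the actual allocator instrumentation only arrives in
later patches of the series):

#define alloc_hook_example(_do_alloc)					\
({									\
	/* point current->alloc_tag at this callsite for the call */	\
	DEFINE_ALLOC_TAG(_alloc_tag, _old);				\
	typeof(_do_alloc) _res = _do_alloc;				\
	alloc_tag_restore(&_alloc_tag, _old);				\
	_res;								\
})

/* e.g. void *p = alloc_hook_example(kmalloc(size, GFP_KERNEL)); */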
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 0be2d00c3696..78d258ca508f 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -972,6 +972,31 @@ config CODE_TAGGING
>  	bool
>  	select KALLSYMS
>  
> +config MEM_ALLOC_PROFILING
> +	bool "Enable memory allocation profiling"
> +	default n
> +	depends on PROC_FS
> +	depends on !DEBUG_FORCE_WEAK_PER_CPU
> +	select CODE_TAGGING
> +	help
> +	  Track allocation source code and record total allocation size
> +	  initiated at that code location. The mechanism can be used to track
> +	  memory leaks with a low performance and memory impact.
> +
> +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> +	bool "Enable memory allocation profiling by default"
> +	default y
> +	depends on MEM_ALLOC_PROFILING
> +
> +config MEM_ALLOC_PROFILING_DEBUG
> +	bool "Memory allocation profiler debugging"
> +	default n
> +	depends on MEM_ALLOC_PROFILING
> +	select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> +	help
> +	  Adds warnings with helpful error messages for memory allocation
> +	  profiling.
> +
>  source "lib/Kconfig.kasan"
>  source "lib/Kconfig.kfence"
>  source "lib/Kconfig.kmsan"
> diff --git a/lib/Makefile b/lib/Makefile
> index 6b48b22fdfac..859112f09bf5 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -236,6 +236,8 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \
>  obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
>  
>  obj-$(CONFIG_CODE_TAGGING) += codetag.o
> +obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o
> +
>  lib-$(CONFIG_GENERIC_BUG) += bug.o
>  
>  obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> new file mode 100644
> index 000000000000..4fc031f9cefd
> --- /dev/null
> +++ b/lib/alloc_tag.c
> @@ -0,0 +1,158 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/alloc_tag.h>
> +#include <linux/fs.h>
> +#include <linux/gfp.h>
> +#include <linux/module.h>
> +#include <linux/proc_fs.h>
> +#include <linux/seq_buf.h>
> +#include <linux/seq_file.h>
> +
> +static struct codetag_type *alloc_tag_cttype;
> +
> +DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> +			mem_alloc_profiling_key);
> +
> +static void *allocinfo_start(struct seq_file *m, loff_t *pos)
> +{
> +	struct codetag_iterator *iter;
> +	struct codetag *ct;
> +	loff_t node = *pos;
> +
> +	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
> +	m->private = iter;
> +	if (!iter)
> +		return NULL;
> +
> +	codetag_lock_module_list(alloc_tag_cttype, true);
> +	*iter = codetag_get_ct_iter(alloc_tag_cttype);
> +	while ((ct = codetag_next_ct(iter)) != NULL && node)
> +		node--;
> +
> +	return ct ? iter : NULL;
> +}
> +
> +static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
> +{
> +	struct codetag_iterator *iter = (struct codetag_iterator *)arg;
> +	struct codetag *ct = codetag_next_ct(iter);
> +
> +	(*pos)++;
> +	if (!ct)
> +		return NULL;
> +
> +	return iter;
> +}
> +
> +static void allocinfo_stop(struct seq_file *m, void *arg)
> +{
> +	struct codetag_iterator *iter = (struct codetag_iterator *)m->private;
> +
> +	if (iter) {
> +		codetag_lock_module_list(alloc_tag_cttype, false);
> +		kfree(iter);
> +	}
> +}
> +
> +static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
> +{
> +	struct alloc_tag *tag = ct_to_alloc_tag(ct);
> +	struct alloc_tag_counters counter = alloc_tag_read(tag);
> +	s64 bytes = counter.bytes;
> +	char val[10], *p = val;
> +
> +	if (bytes < 0) {
> +		*p++ = '-';
> +		bytes = -bytes;
> +	}
> +
> +	string_get_size(bytes, 1,
> +			STRING_SIZE_BASE2|STRING_SIZE_NOSPACE,
> +			p, val + ARRAY_SIZE(val) - p);
> +
> +	seq_buf_printf(out, "%8s %8llu ", val, counter.calls);
> +	codetag_to_text(out, ct);
> +	seq_buf_putc(out, ' ');
> +	seq_buf_putc(out, '\n');
> +}

/me does happy seq_buf dance!

> +
> +static int allocinfo_show(struct seq_file *m, void *arg)
> +{
> +	struct codetag_iterator *iter = (struct codetag_iterator *)arg;
> +	char *bufp;
> +	size_t n = seq_get_buf(m, &bufp);
> +	struct seq_buf buf;
> +
> +	seq_buf_init(&buf, bufp, n);
> +	alloc_tag_to_text(&buf, iter->ct);
> +	seq_commit(m, seq_buf_used(&buf));
> +	return 0;
> +}
> +
> +static const struct seq_operations allocinfo_seq_op = {
> +	.start	= allocinfo_start,
> +	.next	= allocinfo_next,
> +	.stop	= allocinfo_stop,
> +	.show	= allocinfo_show,
> +};
> +
> +static void __init procfs_init(void)
> +{
> +	proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op);
> +}

As mentioned, this really should be in /sys somewhere.
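
For the sake of argument, one possible shape (entirely untested sketch:
names invented, buffer sizing and error handling hand-waved) would be a
binary attribute under /sys/kernel, so the output is not capped at a single
PAGE_SIZE show() buffer, reusing the codetag iterator from this patch:

/* additionally needs <linux/sysfs.h>, <linux/kobject.h>, <linux/mm.h>, <linux/sizes.h> */
static ssize_t allocinfo_read(struct file *file, struct kobject *kobj,
			      struct bin_attribute *attr, char *buf,
			      loff_t off, size_t count)
{
	struct codetag_iterator iter;
	struct codetag *ct;
	struct seq_buf sbuf;
	char *text;
	ssize_t ret;

	/* scratch buffer with an arbitrary cap, just for the sketch */
	text = kvmalloc(SZ_1M, GFP_KERNEL);
	if (!text)
		return -ENOMEM;
	seq_buf_init(&sbuf, text, SZ_1M);

	codetag_lock_module_list(alloc_tag_cttype, true);
	iter = codetag_get_ct_iter(alloc_tag_cttype);
	while ((ct = codetag_next_ct(&iter)))
		alloc_tag_to_text(&sbuf, ct);
	codetag_lock_module_list(alloc_tag_cttype, false);

	/* hand userspace the requested slice of the formatted text */
	ret = memory_read_from_buffer(buf, count, &off, text, seq_buf_used(&sbuf));
	kvfree(text);
	return ret;
}
static BIN_ATTR_RO(allocinfo, 0);

static void __init sysfs_init(void)
{
	WARN_ON(sysfs_create_bin_file(kernel_kobj, &bin_attr_allocinfo));
}

Whether a multi-line, multi-record text file really belongs in sysfs is its
own discussion, of course.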

> +
> +static bool alloc_tag_module_unload(struct codetag_type *cttype,
> +				    struct codetag_module *cmod)
> +{
> +	struct codetag_iterator iter = codetag_get_ct_iter(cttype);
> +	struct alloc_tag_counters counter;
> +	bool module_unused = true;
> +	struct alloc_tag *tag;
> +	struct codetag *ct;
> +
> +	for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
> +		if (iter.cmod != cmod)
> +			continue;
> +
> +		tag = ct_to_alloc_tag(ct);
> +		counter = alloc_tag_read(tag);
> +
> +		if (WARN(counter.bytes, "%s:%u module %s func:%s has %llu allocated at module unload",
> +			  ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
> +			module_unused = false;
> +	}
> +
> +	return module_unused;
> +}
> +
> +static struct ctl_table memory_allocation_profiling_sysctls[] = {
> +	{
> +		.procname	= "mem_profiling",
> +		.data		= &mem_alloc_profiling_key,
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> +		.mode		= 0444,
> +#else
> +		.mode		= 0644,
> +#endif
> +		.proc_handler	= proc_do_static_key,
> +	},
> +	{ }
> +};
> +
> +static int __init alloc_tag_init(void)
> +{
> +	const struct codetag_type_desc desc = {
> +		.section	= "alloc_tags",
> +		.tag_size	= sizeof(struct alloc_tag),
> +		.module_unload	= alloc_tag_module_unload,
> +	};
> +
> +	alloc_tag_cttype = codetag_register_type(&desc);
> +	if (IS_ERR_OR_NULL(alloc_tag_cttype))
> +		return PTR_ERR(alloc_tag_cttype);
> +
> +	register_sysctl_init("vm", memory_allocation_profiling_sysctls);
> +	procfs_init();
> +
> +	return 0;
> +}
> +module_init(alloc_tag_init);
> diff --git a/scripts/module.lds.S b/scripts/module.lds.S
> index bf5bcf2836d8..45c67a0994f3 100644
> --- a/scripts/module.lds.S
> +++ b/scripts/module.lds.S
> @@ -9,6 +9,8 @@
>  #define DISCARD_EH_FRAME	*(.eh_frame)
>  #endif
>  
> +#include <asm-generic/codetag.lds.h>
> +
>  SECTIONS {
>  	/DISCARD/ : {
>  		*(.discard)
> @@ -47,12 +49,17 @@ SECTIONS {
>  	.data : {
>  		*(.data .data.[0-9a-zA-Z_]*)
>  		*(.data..L*)
> +		CODETAG_SECTIONS()
>  	}
>  
>  	.rodata : {
>  		*(.rodata .rodata.[0-9a-zA-Z_]*)
>  		*(.rodata..L*)
>  	}
> +#else
> +	.data : {
> +		CODETAG_SECTIONS()
> +	}
>  #endif
>  }

Otherwise, looks good.

-- 
Kees Cook



