Re: [PATCH v2 1/1] x86/elf: Add a new .note section containing Xfeatures information to x86 core files

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, May 07, 2024 at 03:23:31PM +0530, Vignesh Balasubramanian wrote:
> Add a new .note section containing type, size, offset and flags of
> every xfeature that is present.
> 
> This information will be used by the debuggers to understand the XSAVE
> layout of the machine where the core file is dumped, and to read XSAVE
> registers, especially during cross-platform debugging.
> 
> Some background:
> 
> The XSAVE layouts of modern AMD and Intel CPUs differ, especially since
> Memory Protection Keys and the AVX-512 features have been inculcated into
> the AMD CPUs.
> This is since AMD never adopted (and hence never left room in the XSAVE
> layout for) the Intel MPX feature. Tools like GDB had assumed a fixed XSAVE
> layout matching that of Intel (based on the XCR0 mask).
> Hence, the core dumps from AMD CPUs didn't match the known size for the
> XCR0 mask. This resulted in GDB and other tools not being able to access
> the values of the AVX-512 and PKRU registers on AMD CPUs.
> To solve this, an interim solution has been accepted into GDB, and is
> already a part of GDB 14, thanks to these series of patches
> [ https://sourceware.org/pipermail/gdb-patches/2023-March/198081.html ].
> But this patch series depends on heuristics based on the total XSAVE
> register set size and the XCR0 mask to infer the layouts of the various
> register blocks for core dumps, and hence, is not a foolproof mechanism to
> determine the layout of the XSAVE area.
> 
> Hence this new core dump note has been proposed as a more sturdy mechanism
> to allow GDB/LLDB and other relevant tools to determine the layout of the
> XSAVE area of the machine where the corefile was dumped.
> The new core dump note (which is being proposed as a per-process .note
> section), NT_X86_XSAVE_LAYOUT (0x205) contains an array of structures.
> Each structure describes an individual extended feature containing offset,
> size and flags (that is obtained through CPUID instruction) in a format
> roughly matching the follow C structure:
> 
> struct xfeat_component {
>        u32 xfeat_type;
>        u32 xfeat_sz;
>        u32 xfeat_off;
>        u32 xfeat_flags;
> };
> 
> Co-developed-by: Jini Susan George <jinisusan.george@xxxxxxx>
> Signed-off-by: Jini Susan George <jinisusan.george@xxxxxxx>
> Signed-off-by: Vignesh Balasubramanian <vigbalas@xxxxxxx>
> ---
> v1->v2: Removed kernel internal defn dependency, code improvements
> 
>  arch/x86/Kconfig             |   1 +
>  arch/x86/include/asm/elf.h   |  34 +++++++++
>  arch/x86/kernel/fpu/xstate.c | 141 +++++++++++++++++++++++++++++++++++
>  fs/binfmt_elf.c              |   4 +-
>  include/uapi/linux/elf.h     |   1 +
>  5 files changed, 179 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 928820e61cb5..cc67daab3396 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -105,6 +105,7 @@ config X86
>  	select ARCH_HAS_DEBUG_WX
>  	select ARCH_HAS_ZONE_DMA_SET if EXPERT
>  	select ARCH_HAVE_NMI_SAFE_CMPXCHG
> +	select ARCH_HAVE_EXTRA_ELF_NOTES
>  	select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
>  	select ARCH_MIGHT_HAVE_ACPI_PDC		if ACPI
>  	select ARCH_MIGHT_HAVE_PC_PARPORT
> diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
> index 1fb83d47711f..5952574db64b 100644
> --- a/arch/x86/include/asm/elf.h
> +++ b/arch/x86/include/asm/elf.h
> @@ -13,6 +13,40 @@
>  #include <asm/auxvec.h>
>  #include <asm/fsgsbase.h>
>  
> +struct xfeat_component {
> +	u32 xfeat_type;
> +	u32 xfeat_sz;
> +	u32 xfeat_off;
> +	u32 xfeat_flags;
> +} __packed;
> +
> +_Static_assert(sizeof(struct xfeat_component)%4 == 0, "xfeat_component is not aligned");
> +
> +enum custom_feature {
> +	FEATURE_XSAVE_FP = 0,
> +	FEATURE_XSAVE_SSE = 1,
> +	FEATURE_XSAVE_YMM = 2,
> +	FEATURE_XSAVE_BNDREGS = 3,
> +	FEATURE_XSAVE_BNDCSR = 4,
> +	FEATURE_XSAVE_OPMASK = 5,
> +	FEATURE_XSAVE_ZMM_Hi256 = 6,
> +	FEATURE_XSAVE_Hi16_ZMM = 7,
> +	FEATURE_XSAVE_PT = 8,
> +	FEATURE_XSAVE_PKRU = 9,
> +	FEATURE_XSAVE_PASID = 10,
> +	FEATURE_XSAVE_CET_USER = 11,
> +	FEATURE_XSAVE_CET_SHADOW_STACK = 12,
> +	FEATURE_XSAVE_HDC = 13,
> +	FEATURE_XSAVE_UINTR = 14,
> +	FEATURE_XSAVE_LBR = 15,
> +	FEATURE_XSAVE_HWP = 16,
> +	FEATURE_XSAVE_XTILE_CFG = 17,
> +	FEATURE_XSAVE_XTILE_DATA = 18,
> +	FEATURE_MAX,
> +	FEATURE_XSAVE_EXTENDED_START = FEATURE_XSAVE_YMM,
> +	FEATURE_XSAVE_EXTENDED_END = FEATURE_XSAVE_XTILE_DATA,
> +};
> +
>  typedef unsigned long elf_greg_t;
>  
>  #define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t))
> diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
> index 33a214b1a4ce..3d1c3c96e34d 100644
> --- a/arch/x86/kernel/fpu/xstate.c
> +++ b/arch/x86/kernel/fpu/xstate.c
> @@ -13,6 +13,7 @@
>  #include <linux/seq_file.h>
>  #include <linux/proc_fs.h>
>  #include <linux/vmalloc.h>
> +#include <linux/coredump.h>
>  
>  #include <asm/fpu/api.h>
>  #include <asm/fpu/regset.h>
> @@ -87,6 +88,8 @@ static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;
>  #define XSTATE_FLAG_SUPERVISOR	BIT(0)
>  #define XSTATE_FLAG_ALIGNED64	BIT(1)
>  
> +static const char owner_name[] = "LINUX";

This needs to move under the CONFIG_COREDUMP below (so says the build
bots).

> +
>  /*
>   * Return whether the system supports a given xfeature.
>   *
> @@ -1837,3 +1840,141 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
>  	return 0;
>  }
>  #endif /* CONFIG_PROC_PID_ARCH_STATUS */
> +
> +#ifdef CONFIG_COREDUMP
> +static int get_sub_leaf(int custom_xfeat)

Why is this "int"? I don't imagine there are negative features?

> +{
> +	switch (custom_xfeat) {
> +	case FEATURE_XSAVE_YMM:			return XFEATURE_YMM;
> +	case FEATURE_XSAVE_BNDREGS:		return XFEATURE_BNDREGS;
> +	case FEATURE_XSAVE_BNDCSR:		return XFEATURE_BNDCSR;
> +	case FEATURE_XSAVE_OPMASK:		return XFEATURE_OPMASK;
> +	case FEATURE_XSAVE_ZMM_Hi256:		return XFEATURE_ZMM_Hi256;
> +	case FEATURE_XSAVE_Hi16_ZMM:		return XFEATURE_Hi16_ZMM;
> +	case FEATURE_XSAVE_PT:			return XFEATURE_PT_UNIMPLEMENTED_SO_FAR;
> +	case FEATURE_XSAVE_PKRU:		return XFEATURE_PKRU;
> +	case FEATURE_XSAVE_PASID:		return XFEATURE_PASID;
> +	case FEATURE_XSAVE_CET_USER:		return XFEATURE_CET_USER;
> +	case FEATURE_XSAVE_CET_SHADOW_STACK:	return XFEATURE_CET_KERNEL_UNUSED;
> +	case FEATURE_XSAVE_HDC:			return XFEATURE_RSRVD_COMP_13;
> +	case FEATURE_XSAVE_UINTR:		return XFEATURE_RSRVD_COMP_14;
> +	case FEATURE_XSAVE_LBR:			return XFEATURE_LBR;
> +	case FEATURE_XSAVE_HWP:			return XFEATURE_RSRVD_COMP_16;
> +	case FEATURE_XSAVE_XTILE_CFG:		return XFEATURE_XTILE_CFG;
> +	case FEATURE_XSAVE_XTILE_DATA:		return XFEATURE_XTILE_DATA;
> +	default:
> +		pr_warn_ratelimited("Not a valid XSAVE Feature.");

This isn't very friendly; it's keeping secrets about the unknown value. :)
Also it's missing a newline. How about:

		pr_warn_ratelimited("Not a known XSAVE Feature: %u\n",
				    custom_xfeat);

> +		return 0;
> +	}
> +}
> +
> +/*
> + * Dump type, size, offset and flag values for every xfeature that is present.
> + */
> +static int dump_xsave_layout_desc(struct coredump_params *cprm)
> +{
> +	u32 supported_features = 0;
> +	struct xfeat_component xc;
> +	u32 eax, ebx, ecx, edx;
> +	int num_records = 0;
> +	int sub_leaf = 0;
> +	int i;
> +
> +	/* Find supported extended features */
> +	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
> +	supported_features = eax;
> +
> +	for (i = FEATURE_XSAVE_EXTENDED_START;
> +			i <= FEATURE_XSAVE_EXTENDED_END; i++) {
> +		sub_leaf = get_sub_leaf(i);
> +		if (!sub_leaf)
> +			continue;
> +		if (supported_features & (1U << sub_leaf)) {
> +			cpuid_count(XSTATE_CPUID, sub_leaf, &eax, &ebx, &ecx, &edx);
> +			xc.xfeat_type = i;
> +			xc.xfeat_sz = eax;
> +			xc.xfeat_off = ebx;
> +			/* Reserved for future use */
> +			xc.xfeat_flags = 0;
> +
> +			if (!dump_emit(cprm, &xc,
> +				       sizeof(struct xfeat_component)))
> +				return 0;
> +			num_records++;
> +		}
> +	}
> +
> +	return num_records;
> +}
> +
> +static int get_xsave_desc_size(void)

This can return u32: never negative.

> +{
> +	int supported_features = 0;
> +	int xfeatures_count = 0;
> +	u32 eax, ebx, ecx, edx;
> +	int sub_leaf = 0;
> +	int i;

"i" can be u32 and then we can fix the get_sub_leaf() arg type.

> +
> +	/* Find supported extended features */
> +	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
> +	supported_features = eax;
> +
> +	for (i = FEATURE_XSAVE_EXTENDED_START;
> +			i <= FEATURE_XSAVE_EXTENDED_END; i++) {
> +		sub_leaf = get_sub_leaf(i);
> +		if (!sub_leaf)
> +			continue;
> +		if (supported_features & (1U << sub_leaf))
> +			xfeatures_count++;
> +	}
> +
> +	return xfeatures_count * (sizeof(struct xfeat_component));
> +}
> +
> +int elf_coredump_extra_notes_write(struct coredump_params *cprm)
> +{
> +	int num_records = 0;
> +	struct elf_note en;
> +
> +	en.n_namesz = sizeof(owner_name);
> +	en.n_descsz = get_xsave_desc_size();
> +	en.n_type = NT_X86_XSAVE_LAYOUT;
> +
> +	if (!dump_emit(cprm, &en, sizeof(en)))
> +		return 1;
> +	if (!dump_emit(cprm, owner_name, en.n_namesz))
> +		return 1;
> +	if (!dump_align(cprm, 4))
> +		return 1;
> +
> +	num_records = dump_xsave_layout_desc(cprm);
> +	if (!num_records) {
> +		pr_warn_ratelimited("Error adding XSTATE layout ELF note. XSTATE buffer in the core file will be unparseable.");

Missing trailing newline.

> +		return 1;
> +	}
> +
> +	/* Total size should be equal to the number of records */
> +	if ((sizeof(struct xfeat_component) * num_records) != en.n_descsz) {
> +		pr_warn_ratelimited("Error adding XSTATE layout ELF note. The size of the .note section does not match with the total size of the records.");

Same.

> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Return the size of new note.
> + */
> +int elf_coredump_extra_notes_size(void)
> +{
> +	int size = 0;
> +
> +	/* NOTE Header */
> +	size += sizeof(struct elf_note);
> +	/* name + align */
> +	size += roundup(sizeof(owner_name), 4);
> +	size += get_xsave_desc_size();
> +
> +	return size;
> +}
> +#endif

Since it's a long if/endif, add: /* CONFIG_COREDUMP */ after the endif
here.

> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 5397b552fbeb..833bcb7e957b 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -2000,7 +2000,7 @@ static int elf_core_dump(struct coredump_params *cprm)
>  	{
>  		size_t sz = info.size;
>  
> -		/* For cell spufs */
> +		/* For cell spufs and x86 xstate */
>  		sz += elf_coredump_extra_notes_size();
>  
>  		phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
> @@ -2064,7 +2064,7 @@ static int elf_core_dump(struct coredump_params *cprm)
>  	if (!write_note_info(&info, cprm))
>  		goto end_coredump;
>  
> -	/* For cell spufs */
> +	/* For cell spufs and x86 xstate */
>  	if (elf_coredump_extra_notes_write(cprm))
>  		goto end_coredump;
>  
> diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
> index b54b313bcf07..e30a9b47dc87 100644
> --- a/include/uapi/linux/elf.h
> +++ b/include/uapi/linux/elf.h
> @@ -411,6 +411,7 @@ typedef struct elf64_shdr {
>  #define NT_X86_XSTATE	0x202		/* x86 extended state using xsave */
>  /* Old binutils treats 0x203 as a CET state */
>  #define NT_X86_SHSTK	0x204		/* x86 SHSTK state */
> +#define NT_X86_XSAVE_LAYOUT	0x205	/* XSAVE layout description */
>  #define NT_S390_HIGH_GPRS	0x300	/* s390 upper register halves */
>  #define NT_S390_TIMER	0x301		/* s390 timer register */
>  #define NT_S390_TODCMP	0x302		/* s390 TOD clock comparator register */
> -- 
> 2.34.1
> 

Otherwise looks good. I'd like to see feedback from Intel folks too.

Thanks for working on this!

-Kees

-- 
Kees Cook




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux