Hi, Let me share how to reproduce the issue. Here are the steps to reproduce the issue using an arm64 qemu/kvm guest machine with 4 cpus, where cpu#1 is offlined, cpu#2 fails to get stopped and cpu#3 panics. 1) Prepare a qemu/kvm guest machine. The following article is useful for installing an aarch64 guest: https://fedoraproject.org/wiki/Architectures/AArch64/Install_with_QEMU It looks like the latest image currently available is fc37's: https://mirror.umd.edu/fedora/linux/releases/37/Server/aarch64/images/ 2) Configure the guest machine so it has 4 cpus. 3) Build the following kernel module as repro.ko on the guest: repro.c: ```c #include <linux/kernel.h> #include <linux/module.h> #include <linux/irqflags.h> #include <linux/delay.h> #include <linux/printk.h> #include <linux/kern_levels.h> #include <linux/kthread.h> #include <linux/moduleparam.h> #include <linux/kprobes.h> int thread_function(void *id) { local_irq_disable(); for (;;) udelay(1); return 0; } int my_panic(void *id) { panic("repro"); return 0; } static int __init repro_init(void) { struct task_struct *t = NULL; struct task_struct *tt = NULL; t = kthread_create_on_cpu(thread_function, NULL, 2, "repro"); if (!t) { printk(KERN_INFO "Failed to create a kernel thread for repro\n"); return -1; } tt = kthread_create_on_cpu(my_panic, NULL, 3, "mypanic"); if (!tt) { printk(KERN_INFO "Failed to create a kernel thread for panic\n"); return -1; } wake_up_process(t); mdelay(5000); wake_up_process(tt); return 0; } static void __exit repro_exit(void) { } module_init(repro_init) module_exit(repro_exit) MODULE_LICENSE("GPL"); ``` Makefile: ``` obj-m := repro.o all: $(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(shell pwd) V=1 modules clean: $(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(shell pwd) V=1 clean rm -rf *~ ``` 4) Run the following bash script on the guest. Note that loading repro.ko triggers a kernel panic and then kdump. ```bash #! 
/bin/bash echo 0 > /sys/devices/system/cpu/cpu1/online insmod repro.ko ``` 5) Then, check if the issue is reproduced using the vmcore created by kdump: # crash vmlinux vmcore ________________________________________ From: HATAYAMA Daisuke <d.hatayama@xxxxxxxxxxx> Sent: Friday, May 26, 2023 19:45 To: crash-utility@xxxxxxxxxx Cc: Hatayama, Daisuke/畑山 大輔 Subject: [PATCH 1/2] diskdump/netdump: fix segmentation fault caused by failure of stopping CPUs There's no NMI on ARM. Hence, stopping the non-panicking CPUs from the panicking CPU via IPI can easily fail if interrupts are masked on those CPUs at that moment. Moreover, crash_notes are not initialized for such unstopped CPUs and the corresponding NT_PRSTATUS notes are not attached to the vmcore. However, the crash utility never takes such uninitialized crash_notes into consideration and so ends up mapping the wrong NT_PRSTATUS notes to the CPUs that were actually not stopped. This corrupt mapping can cause the crash utility to hit a segmentation fault in operations where register values in NT_PRSTATUS notes are used. For example: crash> bt 1408 PID: 1408 TASK: ffff000003e22200 CPU: 2 COMMAND: "repro" Segmentation fault (core dumped) crash> help -D diskdump_data: filename: 127.0.0.1-2023-05-26-02:21:27/vmcore-ld1 flags: 46 (KDUMP_CMPRS_LOCAL|ERROR_EXCLUDED|LZO_SUPPORTED) ...snip... notes_buf: 1815df0 num_vmcoredd_notes: 0 num_prstatus_notes: 5 notes[0]: 1815df0 (NT_PRSTATUS) si.signo: 0 si.code: 0 si.errno: 0 ...snip... PSTATE: 80400005 FPVALID: 00000000 notes[4]: 1808f10 (NT_PRSTATUS) Segmentation fault (core dumped) To fix this issue, let's map NT_PRSTATUS to a CPU only if the corresponding crash_notes is confirmed to be initialized. 
Signed-off-by: HATAYAMA Daisuke <d.hatayama@xxxxxxxxxxx> --- defs.h | 1 + diskdump.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- netdump.c | 5 ++++- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/defs.h b/defs.h index 12ad6aa..72129a1 100644 --- a/defs.h +++ b/defs.h @@ -7111,6 +7111,7 @@ int dumpfile_is_split(void); void show_split_dumpfiles(void); void x86_process_elf_notes(void *, unsigned long); void *diskdump_get_prstatus_percpu(int); +int have_crash_notes(int cpu); void map_cpus_to_prstatus_kdump_cmprs(void); void diskdump_display_regs(int, FILE *); void process_elf32_notes(void *, ulong); diff --git a/diskdump.c b/diskdump.c index cf5f5d9..11d29d3 100644 --- a/diskdump.c +++ b/diskdump.c @@ -101,6 +101,55 @@ int dumpfile_is_split(void) return KDUMP_SPLIT(); } +int have_crash_notes(int cpu) +{ + ulong crash_notes, notes_ptr; + char *buf, *p; + Elf64_Nhdr *note = NULL; + + if (!readmem(symbol_value("crash_notes"), + KVADDR, + &crash_notes, + sizeof(crash_notes), + "crash_notes", + RETURN_ON_ERROR)) { + error(WARNING, "cannot read \"crash_notes\"\n"); + return FALSE; + } + + if (symbol_exists("__per_cpu_offset")) + notes_ptr = crash_notes + kt->__per_cpu_offset[cpu]; + else + notes_ptr = crash_notes; + + buf = GETBUF(SIZE(note_buf)); + + if (!readmem(notes_ptr, + KVADDR, + buf, + SIZE(note_buf), + "note_buf_t", + RETURN_ON_ERROR)) { + error(WARNING, "cpu %d: cannot read NT_PRSTATUS note\n", cpu); + return FALSE; + } + + note = (Elf64_Nhdr *)buf; + p = buf + sizeof(Elf64_Nhdr); + + if (note->n_type != NT_PRSTATUS) { + error(WARNING, "cpu %d: invalid NT_PRSTATUS note (n_type != NT_PRSTATUS)\n", cpu); + return FALSE; + } + + if (!STRNEQ(p, "CORE")) { + error(WARNING, "cpu %d: invalid NT_PRSTATUS note (name != \"CORE\")\n", cpu); + return FALSE; + } + + return TRUE; +} + void map_cpus_to_prstatus_kdump_cmprs(void) { @@ -131,7 +180,7 @@ map_cpus_to_prstatus_kdump_cmprs(void) nrcpus = (kt->kernel_NR_CPUS ? 
kt->kernel_NR_CPUS : NR_CPUS); for (i = 0, j = 0; i < nrcpus; i++) { - if (in_cpu_map(ONLINE_MAP, i)) { + if (in_cpu_map(ONLINE_MAP, i) && have_crash_notes(i)) { dd->nt_prstatus_percpu[i] = nt_ptr[j++]; dd->num_prstatus_notes = MAX(dd->num_prstatus_notes, i+1); diff --git a/netdump.c b/netdump.c index 01af145..b272984 100644 --- a/netdump.c +++ b/netdump.c @@ -99,8 +99,11 @@ map_cpus_to_prstatus(void) nrcpus = (kt->kernel_NR_CPUS ? kt->kernel_NR_CPUS : NR_CPUS); for (i = 0, j = 0; i < nrcpus; i++) { - if (in_cpu_map(ONLINE_MAP, i)) + if (in_cpu_map(ONLINE_MAP, i) && have_crash_notes(i)) { nd->nt_prstatus_percpu[i] = nt_ptr[j++]; + nd->num_prstatus_notes = + MAX(nd->num_prstatus_notes, i+1); + } } FREEBUF(nt_ptr); -- 2.25.1 -- Crash-utility mailing list Crash-utility@xxxxxxxxxx https://listman.redhat.com/mailman/listinfo/crash-utility Contribution Guidelines: https://github.com/crash-utility/crash/wiki