Re: [PATCH 1/2] diskdump/netdump: fix segmentation fault caused by failure of stopping CPUs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

Let me share how to reproduce the issue.

Here are the steps to reproduce the issue using an arm64 qemu/kvm guest
machine with 4 cpus, where cpu#1 is offline, cpu#2 fails to be stopped,
and cpu#3 panics.

1) Prepare qemu/kvm guest machine.

   The following article is useful to install aarch64 guest:

     https://fedoraproject.org/wiki/Architectures/AArch64/Install_with_QEMU

   The latest image currently available appears to be fc37's:

     https://mirror.umd.edu/fedora/linux/releases/37/Server/aarch64/images/

2) Configure the guest machine so it has 4 cpus.

3) Build the following kernel module as repro.ko on the guest:

repro.c:
```c
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/irqflags.h>
#include <linux/delay.h>
#include <linux/printk.h>
#include <linux/kern_levels.h>
#include <linux/kthread.h>
#include <linux/moduleparam.h>
#include <linux/kprobes.h>

/*
 * Thread body for the "repro" kthread: mask local interrupts and then
 * busy-wait forever, so the CPU running this thread stops servicing
 * interrupts (including the stop IPI sent by a panicking CPU).
 *
 * @id: unused thread argument.
 */
static int thread_function(void *id)
{
        local_irq_disable();
        for (;;)
                udelay(1);

        /* Not reached; kept only to satisfy the int return type. */
        return 0;
}

/*
 * Thread body for the "mypanic" kthread: panic the kernel from the CPU
 * this thread is running on.
 *
 * @id: unused thread argument.
 */
static int my_panic(void *id)
{
        panic("repro");

        /* Not reached: panic() does not return. */
        return 0;
}

/*
 * Module entry point.
 *
 * Creates one kthread bound to CPU 2 that spins with interrupts masked and
 * one bound to CPU 3 that panics. The spinner is started first; after a
 * 5 second busy-wait the panicking thread is started.
 *
 * Returns 0 on success, or the negative errno reported by
 * kthread_create_on_cpu() on failure.
 */
static int __init repro_init(void)
{
        struct task_struct *t;
        struct task_struct *tt;

        /* kthread_create_on_cpu() reports failure via ERR_PTR(), never NULL. */
        t = kthread_create_on_cpu(thread_function, NULL, 2, "repro");
        if (IS_ERR(t)) {
                pr_err("Failed to create a kernel thread for repro\n");
                return PTR_ERR(t);
        }

        tt = kthread_create_on_cpu(my_panic, NULL, 3, "mypanic");
        if (IS_ERR(tt)) {
                pr_err("Failed to create a kernel thread for panic\n");
                /* t was never woken, so kthread_stop() reaps it without running it. */
                kthread_stop(t);
                return PTR_ERR(tt);
        }

        wake_up_process(t);
        /* Let the spinner mask interrupts on CPU 2 before CPU 3 panics. */
        mdelay(5000);
        wake_up_process(tt);

        return 0;
}

/*
 * Module exit hook. Intentionally empty: no cleanup is performed.
 * NOTE(review): the "repro" thread started by repro_init() loops forever and
 * is never stopped here -- presumably acceptable because the kernel is
 * expected to panic shortly after the module is loaded; confirm before
 * reusing this module for anything else.
 */
static void __exit repro_exit(void)
{
}

module_init(repro_init)
module_exit(repro_exit)
MODULE_LICENSE("GPL");
```

Makefile:
```
# Out-of-tree kbuild Makefile for the repro.ko reproducer module.
obj-m := repro.o

# Build tree of the running kernel; override with `make KDIR=...`.
KDIR ?= /lib/modules/$(shell uname -r)/build

.PHONY: all clean

# Recipe lines must start with a TAB character.
all:
	$(MAKE) -C $(KDIR) M=$(CURDIR) V=1 modules

clean:
	$(MAKE) -C $(KDIR) M=$(CURDIR) V=1 clean
	rm -rf *~
```

4) Run the following bash script on the guest.

   Note that loading repro.ko triggers a kernel panic and then kdump.

```bash
#! /bin/bash
# Offline cpu1, then load the reproducer module (which panics the kernel).
# Abort on the first failure so the module is only loaded once cpu1 has
# actually been taken offline.
set -e

echo 0 > /sys/devices/system/cpu/cpu1/online
insmod repro.ko
```

5) Then, check if the issue is reproduced using the vmcore created by kdump:

    # crash vmlinux vmcore

________________________________________
From: HATAYAMA Daisuke <d.hatayama@xxxxxxxxxxx>
Sent: Friday, May 26, 2023 19:45
To: crash-utility@xxxxxxxxxx
Cc: Hatayama, Daisuke/畑山 大輔
Subject: [PATCH 1/2] diskdump/netdump: fix segmentation fault caused by failure of stopping CPUs

There's no NMI on ARM. Hence, stopping the non-panicking CPUs from the
panicking CPU via IPI can easily fail if interrupts are masked on them
at that moment. Moreover, crash_notes are not initialized for such
unstopped CPUs and the corresponding NT_PRSTATUS notes are not
attached to the vmcore. However, the crash utility never takes such
uninitialized crash_notes into consideration and ends up mapping
different NT_PRSTATUS notes to the CPUs that were actually not stopped.
This corrupt mapping can cause the crash utility to hit a segmentation
fault in operations where register values in NT_PRSTATUS notes are used.

For example:

    crash> bt 1408
        PID: 1408     TASK: ffff000003e22200  CPU: 2    COMMAND: "repro"
        Segmentation fault (core dumped)

        crash> help -D
        diskdump_data:
                          filename: 127.0.0.1-2023-05-26-02:21:27/vmcore-ld1
                                 flags: 46 (KDUMP_CMPRS_LOCAL|ERROR_EXCLUDED|LZO_SUPPORTED)
        ...snip...
                           notes_buf: 1815df0
          num_vmcoredd_notes: 0
          num_prstatus_notes: 5
                                notes[0]: 1815df0 (NT_PRSTATUS)
                                                  si.signo: 0  si.code: 0  si.errno: 0
        ...snip...
                                                  PSTATE: 80400005   FPVALID: 00000000
                                notes[4]: 1808f10 (NT_PRSTATUS)
        Segmentation fault (core dumped)

To fix this issue, let's map an NT_PRSTATUS note to a CPU only if the
corresponding crash_notes has been verified to be initialized.

Signed-off-by: HATAYAMA Daisuke <d.hatayama@xxxxxxxxxxx>
---
 defs.h     |  1 +
 diskdump.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 netdump.c  |  5 ++++-
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/defs.h b/defs.h
index 12ad6aa..72129a1 100644
--- a/defs.h
+++ b/defs.h
@@ -7111,6 +7111,7 @@ int dumpfile_is_split(void);
 void show_split_dumpfiles(void);
 void x86_process_elf_notes(void *, unsigned long);
 void *diskdump_get_prstatus_percpu(int);
+int have_crash_notes(int cpu);
 void map_cpus_to_prstatus_kdump_cmprs(void);
 void diskdump_display_regs(int, FILE *);
 void process_elf32_notes(void *, ulong);
diff --git a/diskdump.c b/diskdump.c
index cf5f5d9..11d29d3 100644
--- a/diskdump.c
+++ b/diskdump.c
@@ -101,6 +101,55 @@ int dumpfile_is_split(void)
        return KDUMP_SPLIT();
 }

+/*
+ * Return TRUE only if @cpu's crash_notes holds an initialized NT_PRSTATUS
+ * note named "CORE"; CPUs that were never stopped leave it uninitialized.
+ */
+int have_crash_notes(int cpu)
+{
+       ulong crash_notes, notes_ptr;
+       char *buf, *p;
+       Elf64_Nhdr *note = NULL;
+       int ret = FALSE;
+
+       if (!readmem(symbol_value("crash_notes"), KVADDR, &crash_notes,
+                    sizeof(crash_notes), "crash_notes", RETURN_ON_ERROR)) {
+               error(WARNING, "cannot read \"crash_notes\"\n");
+               return FALSE;
+       }
+
+       if (symbol_exists("__per_cpu_offset"))
+               notes_ptr = crash_notes + kt->__per_cpu_offset[cpu];
+       else
+               notes_ptr = crash_notes;
+
+       buf = GETBUF(SIZE(note_buf));
+
+       if (!readmem(notes_ptr, KVADDR, buf, SIZE(note_buf),
+                    "note_buf_t", RETURN_ON_ERROR)) {
+               error(WARNING, "cpu %d: cannot read NT_PRSTATUS note\n", cpu);
+               goto out;
+       }
+
+       note = (Elf64_Nhdr *)buf;
+       p = buf + sizeof(Elf64_Nhdr);   /* note name follows the header */
+
+       if (note->n_type != NT_PRSTATUS) {
+               error(WARNING, "cpu %d: invalid NT_PRSTATUS note (n_type != NT_PRSTATUS)\n", cpu);
+               goto out;
+       }
+
+       if (!STRNEQ(p, "CORE")) {
+               error(WARNING, "cpu %d: invalid NT_PRSTATUS note (name != \"CORE\")\n", cpu);
+               goto out;
+       }
+
+       ret = TRUE;
+out:
+       FREEBUF(buf);   /* every exit after GETBUF() must release the buffer */
+       return ret;
+}
+
 void
 map_cpus_to_prstatus_kdump_cmprs(void)
 {
@@ -131,7 +180,7 @@ map_cpus_to_prstatus_kdump_cmprs(void)
        nrcpus = (kt->kernel_NR_CPUS ? kt->kernel_NR_CPUS : NR_CPUS);

        for (i = 0, j = 0; i < nrcpus; i++) {
-               if (in_cpu_map(ONLINE_MAP, i)) {
+               if (in_cpu_map(ONLINE_MAP, i) && have_crash_notes(i)) {
                        dd->nt_prstatus_percpu[i] = nt_ptr[j++];
                        dd->num_prstatus_notes =
                                MAX(dd->num_prstatus_notes, i+1);
diff --git a/netdump.c b/netdump.c
index 01af145..b272984 100644
--- a/netdump.c
+++ b/netdump.c
@@ -99,8 +99,11 @@ map_cpus_to_prstatus(void)
        nrcpus = (kt->kernel_NR_CPUS ? kt->kernel_NR_CPUS : NR_CPUS);

        for (i = 0, j = 0; i < nrcpus; i++) {
-               if (in_cpu_map(ONLINE_MAP, i))
+               if (in_cpu_map(ONLINE_MAP, i) && have_crash_notes(i)) {
                        nd->nt_prstatus_percpu[i] = nt_ptr[j++];
+                       nd->num_prstatus_notes =
+                               MAX(nd->num_prstatus_notes, i+1);
+               }
        }

        FREEBUF(nt_ptr);
--
2.25.1

--
Crash-utility mailing list
Crash-utility@xxxxxxxxxx
https://listman.redhat.com/mailman/listinfo/crash-utility
Contribution Guidelines: https://github.com/crash-utility/crash/wiki




[Index of Archives]     [Fedora Development]     [Fedora Desktop]     [Fedora SELinux]     [Yosemite News]     [KDE Users]     [Fedora Tools]

 

Powered by Linux