[Crash-utility] Re: [PATCH v9 1/6] ppc64: correct gdb passthroughs by implementing machdep->get_cpu_reg

Tao Liu <ltao@xxxxxxxxxx> · Tue, 27 Feb 2024 17:37:05 +0800

On Thu, Feb 22, 2024 at 10:52:56AM +0530, Aditya Gupta wrote:
> Currently, gdb passthroughs of 'bt', 'frame', 'up', 'down', 'info
> locals' don't work. This is due to gdb not knowing the register values to
> unwind the stack frames
> 
> Every gdb passthrough goes through `gdb_interface`. And then, gdb expects
> `crash_target::fetch_registers` to give it the register values, which is
> dependent on `machdep->get_cpu_reg` to read the register values for
> specific architecture.
> 
>                                       ----------------------------
>            gdb passthrough (eg. "bt") |                          |
>    crash   -------------------------> |                          |
>                                       |      gdb_interface       |
>                                       |                          |
>                                       |                          |
>                                       |  ----------------------  |
>                  fetch_registers      |  |                    |  |
> crash_target<-------------------------+--|        gdb         |  |
>             --------------------------+->|                    |  |
>               Registers (SP,NIP, etc.)|  |                    |  |
>                                       |  |                    |  |
>                                       |  ----------------------  |
>                                       ----------------------------
> 
> Implement `machdep->get_cpu_reg` on PPC64, so that crash provides the
> register values to gdb to unwind stack frames properly
> 
> With these changes, on powerpc, 'bt' command output in gdb mode, will look
> like this:
> 
>     gdb> bt
>     #0  0xc0000000002a53e8 in crash_setup_regs (oldregs=<optimized out>, newregs=0xc00000000486f8d8) at ./arch/powerpc/include/asm/kexec.h:69
>     #1  __crash_kexec (regs=<optimized out>) at kernel/kexec_core.c:974
>     #2  0xc000000000168918 in panic (fmt=<optimized out>) at kernel/panic.c:358
>     #3  0xc000000000b735f8 in sysrq_handle_crash (key=<optimized out>) at drivers/tty/sysrq.c:155
>     #4  0xc000000000b742cc in __handle_sysrq (key=key@entry=99, check_mask=check_mask@entry=false) at drivers/tty/sysrq.c:602
>     #5  0xc000000000b7506c in write_sysrq_trigger (file=<optimized out>, buf=<optimized out>, count=2, ppos=<optimized out>) at drivers/tty/sysrq.c:1163
>     #6  0xc00000000069a7bc in pde_write (ppos=<optimized out>, count=<optimized out>, buf=<optimized out>, file=<optimized out>, pde=0xc000000009ed3a80) at fs/proc/inode.c:340
>     #7  proc_reg_write (file=<optimized out>, buf=<optimized out>, count=<optimized out>, ppos=<optimized out>) at fs/proc/inode.c:352
>     #8  0xc0000000005b3bbc in vfs_write (file=file@entry=0xc00000009dda7d00, buf=buf@entry=0xebcfc7c6040 <error: Cannot access memory at address 0xebcfc7c6040>, count=count@entry=2, pos=pos@entry=0xc00000000486fda0) at fs/read_write.c:582
> 
> instead of earlier output without this patch:
> 
>     gdb> bt
>     #0  <unavailable> in ?? ()
>     Backtrace stopped: previous frame identical to this frame (corrupt stack?)
> 
> Also, 'get_dumpfile_regs' has been introduced to get registers from
> multiple supported vmcore formats. Correspondingly a flag 'BT_NO_PRINT_REGS'
> has been introduced to tell helper functions to get registers, to not
> print registers with every call to backtrace in gdb.
> 
>  Note: This feature to support GDB unwinding doesn't support live debugging
> 
> Signed-off-by: Aditya Gupta <adityag@xxxxxxxxxxxxx>
> ---
>  defs.h   | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  kernel.c |  33 +++++++++++++++
>  ppc64.c  | 111 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  3 files changed, 263 insertions(+), 3 deletions(-)
> 
> diff --git a/defs.h b/defs.h
> index 98650e8780bf..414f039b35f1 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -6095,6 +6095,7 @@ int load_module_symbols_helper(char *);
>  void unlink_module(struct load_module *);
>  int check_specified_module_tree(char *, char *);
>  int is_system_call(char *, ulong);
> +void get_dumpfile_regs(struct bt_info*, ulong*, ulong*);
>  void generic_dump_irq(int);
>  void generic_get_irq_affinity(int);
>  void generic_show_interrupts(int, ulong *);
> @@ -6194,6 +6195,7 @@ ulong cpu_map_addr(const char *type);
>  #define BT_REGS_NOT_FOUND (0x4000000000000ULL)
>  #define BT_OVERFLOW_STACK (0x8000000000000ULL)
>  #define BT_SKIP_IDLE     (0x10000000000000ULL)
> +#define BT_NO_PRINT_REGS (0x20000000000000ULL)
>  #define BT_SYMBOL_OFFSET   (BT_SYMBOLIC_ARGS)
>  
>  #define BT_REF_HEXVAL         (0x1)
> @@ -8050,4 +8052,124 @@ enum x86_64_regnum {
>          LAST_REGNUM
>  };
>  
> +/*
> + * Register numbers to make crash_target->fetch_registers()
> + * ---> machdep->get_cpu_reg() work properly.
> + *
> + *  These register numbers and names are given according to output of
> + *  `rs6000_register_name`, because that is what was being used by
> + *  crash_target::fetch_registers in case of PPC64
> + */
> +enum ppc64_regnum {
> +	PPC64_R0_REGNUM = 0,
> +	PPC64_R1_REGNUM,
> +	PPC64_R2_REGNUM,
> +	PPC64_R3_REGNUM,
> +	PPC64_R4_REGNUM,
> +	PPC64_R5_REGNUM,
> +	PPC64_R6_REGNUM,
> +	PPC64_R7_REGNUM,
> +	PPC64_R8_REGNUM,
> +	PPC64_R9_REGNUM,
> +	PPC64_R10_REGNUM,
> +	PPC64_R11_REGNUM,
> +	PPC64_R12_REGNUM,
> +	PPC64_R13_REGNUM,
> +	PPC64_R14_REGNUM,
> +	PPC64_R15_REGNUM,
> +	PPC64_R16_REGNUM,
> +	PPC64_R17_REGNUM,
> +	PPC64_R18_REGNUM,
> +	PPC64_R19_REGNUM,
> +	PPC64_R20_REGNUM,
> +	PPC64_R21_REGNUM,
> +	PPC64_R22_REGNUM,
> +	PPC64_R23_REGNUM,
> +	PPC64_R24_REGNUM,
> +	PPC64_R25_REGNUM,
> +	PPC64_R26_REGNUM,
> +	PPC64_R27_REGNUM,
> +	PPC64_R28_REGNUM,
> +	PPC64_R29_REGNUM,
> +	PPC64_R30_REGNUM,
> +	PPC64_R31_REGNUM,
> +
> +	PPC64_F0_REGNUM = 32,
> +	PPC64_F1_REGNUM,
> +	PPC64_F2_REGNUM,
> +	PPC64_F3_REGNUM,
> +	PPC64_F4_REGNUM,
> +	PPC64_F5_REGNUM,
> +	PPC64_F6_REGNUM,
> +	PPC64_F7_REGNUM,
> +	PPC64_F8_REGNUM,
> +	PPC64_F9_REGNUM,
> +	PPC64_F10_REGNUM,
> +	PPC64_F11_REGNUM,
> +	PPC64_F12_REGNUM,
> +	PPC64_F13_REGNUM,
> +	PPC64_F14_REGNUM,
> +	PPC64_F15_REGNUM,
> +	PPC64_F16_REGNUM,
> +	PPC64_F17_REGNUM,
> +	PPC64_F18_REGNUM,
> +	PPC64_F19_REGNUM,
> +	PPC64_F20_REGNUM,
> +	PPC64_F21_REGNUM,
> +	PPC64_F22_REGNUM,
> +	PPC64_F23_REGNUM,
> +	PPC64_F24_REGNUM,
> +	PPC64_F25_REGNUM,
> +	PPC64_F26_REGNUM,
> +	PPC64_F27_REGNUM,
> +	PPC64_F28_REGNUM,
> +	PPC64_F29_REGNUM,
> +	PPC64_F30_REGNUM,
> +	PPC64_F31_REGNUM,
> +
> +	PPC64_PC_REGNUM = 64,
> +	PPC64_MSR_REGNUM = 65,
> +	PPC64_CR_REGNUM = 66,
> +	PPC64_LR_REGNUM = 67,
> +	PPC64_CTR_REGNUM = 68,
> +	PPC64_XER_REGNUM = 69,
> +	PPC64_FPSCR_REGNUM = 70,
> +
> +	PPC64_VR0_REGNUM = 106,
> +	PPC64_VR1_REGNUM,
> +	PPC64_VR2_REGNUM,
> +	PPC64_VR3_REGNUM,
> +	PPC64_VR4_REGNUM,
> +	PPC64_VR5_REGNUM,
> +	PPC64_VR6_REGNUM,
> +	PPC64_VR7_REGNUM,
> +	PPC64_VR8_REGNUM,
> +	PPC64_VR9_REGNUM,
> +	PPC64_VR10_REGNUM,
> +	PPC64_VR11_REGNUM,
> +	PPC64_VR12_REGNUM,
> +	PPC64_VR13_REGNUM,
> +	PPC64_VR14_REGNUM,
> +	PPC64_VR15_REGNUM,
> +	PPC64_VR16_REGNUM,
> +	PPC64_VR17_REGNUM,
> +	PPC64_VR18_REGNUM,
> +	PPC64_VR19_REGNUM,
> +	PPC64_VR20_REGNUM,
> +	PPC64_VR21_REGNUM,
> +	PPC64_VR22_REGNUM,
> +	PPC64_VR23_REGNUM,
> +	PPC64_VR24_REGNUM,
> +	PPC64_VR25_REGNUM,
> +	PPC64_VR26_REGNUM,
> +	PPC64_VR27_REGNUM,
> +	PPC64_VR28_REGNUM,
> +	PPC64_VR29_REGNUM,
> +	PPC64_VR30_REGNUM,
> +	PPC64_VR31_REGNUM,
> +
> +	PPC64_VSCR_REGNUM = 138,
> +	PPC64_VRSAVE_REGNU = 139
> +};
> +
>  #endif /* !GDB_COMMON */
> diff --git a/kernel.c b/kernel.c
> index 1728b70c1b5c..661ef0e237eb 100644
> --- a/kernel.c
> +++ b/kernel.c
> @@ -3533,6 +3533,39 @@ get_lkcd_regs(struct bt_info *bt, ulong *eip, ulong *esp)
>  	machdep->get_stack_frame(bt, eip, esp);
>  }
>  
> +void
> +get_dumpfile_regs(struct bt_info *bt, ulong *eip, ulong *esp)
> +{
> +	bt->flags |= BT_NO_PRINT_REGS;
> +
> +	if (NETDUMP_DUMPFILE())
> +		get_netdump_regs(bt, eip, esp);
> +	else if (KDUMP_DUMPFILE())
> +		get_kdump_regs(bt, eip, esp);
> +	else if (DISKDUMP_DUMPFILE())
> +		get_diskdump_regs(bt, eip, esp);
> +	else if (KVMDUMP_DUMPFILE())
> +		get_kvmdump_regs(bt, eip, esp);
> +	else if (LKCD_DUMPFILE())
> +		get_lkcd_regs(bt, eip, esp);
> +	else if (XENDUMP_DUMPFILE())
> +		get_xendump_regs(bt, eip, esp);
> +	else if (SADUMP_DUMPFILE())
> +		get_sadump_regs(bt, eip, esp);
> +	else if (VMSS_DUMPFILE())
> +		get_vmware_vmss_regs(bt, eip, esp);
> +	else if (REMOTE_PAUSED()) {
> +		if (!is_task_active(bt->task) || !get_remote_regs(bt, eip, esp))
> +			machdep->get_stack_frame(bt, eip, esp);
> +	} else
> +		machdep->get_stack_frame(bt, eip, esp);
> +
> +	bt->flags &= ~BT_NO_PRINT_REGS;
> +
> +	bt->instptr = *eip;
> +	bt->stkptr = *esp;
> +}
> +
>  
>  /*
>   *  Store the head of the kernel module list for future use.
> diff --git a/ppc64.c b/ppc64.c
> index e8930a139e0d..870b0fbb49ce 100644
> --- a/ppc64.c
> +++ b/ppc64.c
> @@ -55,6 +55,8 @@ static void ppc64_set_bt_emergency_stack(enum emergency_stack_type type,
>  static char * ppc64_check_eframe(struct ppc64_pt_regs *);
>  static void ppc64_print_eframe(char *, struct ppc64_pt_regs *, 
>  		struct bt_info *);
> +static int ppc64_get_cpu_reg(int cpu, int regno, const char *name, int size,
> +		  void *value);
>  static void parse_cmdline_args(void);
>  static int ppc64_paca_percpu_offset_init(int);
>  static void ppc64_init_cpu_info(void);
> @@ -704,6 +706,8 @@ ppc64_init(int when)
>  				error(FATAL, "cannot malloc hwirqstack buffer space.");
>  		}
>  
> +		machdep->get_cpu_reg = ppc64_get_cpu_reg;
> +
>  		ppc64_init_paca_info();
>  
>  		if (!machdep->hz) {
> @@ -2501,6 +2505,105 @@ ppc64_print_eframe(char *efrm_str, struct ppc64_pt_regs *regs,
>  	ppc64_print_nip_lr(regs, 1);
>  }
>  
> +static int
> +ppc64_get_cpu_reg(int cpu, int regno, const char *name, int size,
> +		  void *value)
> +{
> +	struct bt_info bt_info, bt_setup;
> +	struct task_context *tc;
> +	ulong task;
> +	struct ppc64_pt_regs *pt_regs;
> +	ulong ip, sp;
> +
> +	if (LIVE()) {
> +		/* doesn't support reading registers in live dump */
> +		return FALSE;
> +	}

What do you think if we remove the LIVE() check for v10? According to my test(with my x86_64 patchsets), with ppc64 arbitrary task stack unwinding support, we can view the stack unwinding for non-active tasks for live debug:

crash> sys
      KERNEL: /usr/lib/debug/lib/modules/5.14.0-425.el9.ppc64le/vmlinux
    DUMPFILE: /proc/kcore
        CPUS: 8
        DATE: Tue Feb 27 04:19:36 EST 2024
      UPTIME: 01:11:56
LOAD AVERAGE: 0.26, 0.09, 1.70
       TASKS: 196
    NODENAME: ibm-p9z-26-lp13.virt.pnr.lab.eng.rdu2.redhat.com
     RELEASE: 5.14.0-425.el9.ppc64le
     VERSION: #1 SMP Wed Feb 21 15:29:04 EST 2024
     MACHINE: ppc64le  (2900 Mhz)
      MEMORY: 8 GB
crash> set 1
    PID: 1
COMMAND: "systemd"
   TASK: c0000000035fc900  [THREAD_INFO: c0000000035fc900]
    CPU: 1
  STATE: TASK_INTERRUPTIBLE
crash> bt
PID: 1        TASK: c0000000035fc900  CPU: 1    COMMAND: "systemd"
 #0 [c00000000369fa60] __schedule at c000000000fc3c58
 #1 [c00000000369fb20] schedule at c000000000fc411c
 #2 [c00000000369fb50] schedule_hrtimeout_range_clock at c000000000fcd2a4
 #3 [c00000000369fc00] ep_poll at c00000000063640c
 #4 [c00000000369fcf0] do_epoll_wait at c000000000636584
 #5 [c00000000369fd40] sys_epoll_wait at c000000000636608
 #6 [c00000000369fdb0] system_call_exception at c00000000002e994
 #7 [c00000000369fe10] system_call_vectored_common at c00000000000bfe8
crash> gdb bt
#0  0xc000000000fc3c58 in context_switch (rf=0xc00000000369fac0, next=0x0, prev=0x0, rq=0x0) at kernel/sched/core.c:5409
#1  __schedule (sched_mode=sched_mode@entry=0) at kernel/sched/core.c:6737
#2  0xc000000000fc411c in schedule_loop (sched_mode=<optimized out>) at kernel/sched/core.c:6807
#3  schedule () at kernel/sched/core.c:6816
#4  0xc000000000fcd2a4 in schedule_hrtimeout_range_clock (expires=<optimized out>, delta=<optimized out>, mode=<optimized out>, clock_id=<optimized out>) at kernel/time/hrtimer.c:2297
#5  0xc00000000063640c in ep_poll (ep=0xc000000034b32e80, events=0x0, events@entry=0x1001d23a9e0, maxevents=maxevents@entry=82, timeout=timeout@entry=0x0) at fs/eventpoll.c:1904
#6  0xc000000000636584 in do_epoll_wait (epfd=epfd@entry=4, events=events@entry=0x1001d23a9e0, maxevents=maxevents@entry=82, to=0x0) at fs/eventpoll.c:2288
#7  0xc000000000636608 in __do_sys_epoll_wait (timeout=<optimized out>, maxevents=82, events=0x1001d23a9e0, epfd=4) at fs/eventpoll.c:2300
#8  __se_sys_epoll_wait (epfd=4, events=1100000504288, maxevents=82, timeout=<optimized out>) at fs/eventpoll.c:2295
#9  0xc00000000002e994 in system_call_exception (r3=4, r4=1100000504288, r5=82, r6=-1, r7=140735513318112, r8=140735513287592, r0=<optimized out>, regs=0xc00000000369fe80) at arch/powerpc/kernel/interrupt.c:221
#10 0xc00000000000bfe8 in system_call_vectored_common () at arch/powerpc/kernel/interrupt_64.S:209
Backtrace stopped: previous frame inner to this frame (corrupt stack?)

However for those active tasks in live mode, stack unwind will fail, but I think it doesn't matter because "bt" and "gdb bt" both cannot show stacks:

crash> ps
      PID    PPID  CPU       TASK        ST  %MEM      VSZ      RSS  COMM
        0       0   0  c000000002af6380  RU   0.0        0        0  [swapper/0]
>       0       0   1  c0000000035f9000  RU   0.0        0        0  [swapper/1]
>       0       0   2  c0000000035f0180  RU   0.0        0        0  [swapper/2]
>       0       0   3  c0000000035c5580  RU   0.0        0        0  [swapper/3]
>       0       0   4  c0000000035fac80  RU   0.0        0        0  [swapper/4]
>       0       0   5  c0000000035c7200  RU   0.0        0        0  [swapper/5]
>       0       0   6  c0000000035c0000  RU   0.0        0        0  [swapper/6]
>       0       0   7  c0000000035dc800  RU   0.0        0        0  [swapper/7]
        1       0   1  c0000000035fc900  IN   0.2   182272    14016  systemd
crash> set c0000000035f0180
    PID: 0
COMMAND: "swapper/2"
   TASK: c0000000035f0180  (1 of 8)  [THREAD_INFO: c0000000035f0180]
    CPU: 2
  STATE: TASK_RUNNING (ACTIVE)
crash> bt
PID: 0        TASK: c0000000035f0180  CPU: 2    COMMAND: "swapper/2"
(active)
crash> gdb bt
#0  0xc000000003847d50 in ?? ()
gdb: invalid kernel virtual address: fffffffffffffffc  type: "gdb_readmem callback"
gdb: invalid kernel virtual address: fffffffffffffff8  type: "gdb_readmem callback"
gdb: invalid kernel virtual address: fffffffffffffffc  type: "gdb_readmem callback"
gdb: invalid kernel virtual address: fffffffffffffff8  type: "gdb_readmem callback"
#1  0x0000000000000000 in ?? ()

What do you think?

Thanks,
Tao Liu
> +
> +	/* Currently only handling registers available in ppc64_pt_regs:
> +	 *
> +	 * 0-31:   r0-r31
> +	 * 64:     pc/nip
> +	 * 65:     msr
> +	 *
> +	 * 67:     lr
> +	 * 68:     ctr
> +	 */
> +	switch (regno) {
> +	case PPC64_R0_REGNUM ... PPC64_R31_REGNUM:
> +
> +	case PPC64_PC_REGNUM:
> +	case PPC64_MSR_REGNUM:
> +	case PPC64_LR_REGNUM:
> +	case PPC64_CTR_REGNUM:
> +		break;
> +
> +	default:
> +		// return false if we can't get that register
> +		if (CRASHDEBUG(1))
> +			error(WARNING, "unsupported register, regno=%d\n", regno);
> +		return FALSE;
> +	}
> +
> +	task = get_active_task(cpu);
> +	tc = task_to_context(task);
> +	BZERO(&bt_setup, sizeof(struct bt_info));
> +	clone_bt_info(&bt_setup, &bt_info, tc);
> +	fill_stackbuf(&bt_info);
> +
> +	// reusing the get_dumpfile_regs function to get pt regs structure
> +	get_dumpfile_regs(&bt_info, &sp, &ip);
> +	pt_regs = (struct ppc64_pt_regs *)bt_info.machdep;
> +
> +	if (!pt_regs) {
> +		error(WARNING, "pt_regs not available for cpu %d\n", cpu);
> +		return FALSE;
> +	}
> +
> +	switch (regno) {
> +	case PPC64_R0_REGNUM ... PPC64_R31_REGNUM:
> +		if (size != sizeof(pt_regs->gpr[regno]))
> +			return FALSE;  // size mismatch
> +
> +		memcpy(value, &pt_regs->gpr[regno], size);
> +		break;
> +
> +	case PPC64_PC_REGNUM:
> +		if (size != sizeof(pt_regs->nip))
> +			return FALSE;  // size mismatch
> +
> +		memcpy(value, &pt_regs->nip, size);
> +		break;
> +
> +	case PPC64_MSR_REGNUM:
> +		if (size != sizeof(pt_regs->msr))
> +			return FALSE;  // size mismatch
> +
> +		memcpy(value, &pt_regs->msr, size);
> +		break;
> +
> +	case PPC64_LR_REGNUM:
> +		if (size != sizeof(pt_regs->link))
> +			return FALSE;  // size mismatch
> +
> +		memcpy(value, &pt_regs->link, size);
> +		break;
> +
> +	case PPC64_CTR_REGNUM:
> +		if (size != sizeof(pt_regs->ctr))
> +			return FALSE;  // size mismatch
> +
> +		memcpy(value, &pt_regs->ctr, size);
> +		break;
> +	}
> +
> +	/* free buffer allocated by fill_stackbuf */
> +	if (bt_info.stackbuf)
> +		FREEBUF(bt_info.stackbuf);
> +
> +	return TRUE;
> +}
> +
>  /*
>   * For vmcore typically saved with KDump or FADump, get SP and IP values
>   * from the saved ptregs.
> @@ -2613,9 +2716,11 @@ ppc64_get_dumpfile_stack_frame(struct bt_info *bt_in, ulong *nip, ulong *ksp)
>  		pt_regs = (struct ppc64_pt_regs *)bt->machdep;
>  		ur_nip = pt_regs->nip;
>  		ur_ksp = pt_regs->gpr[1];
> -		/* Print the collected regs for panic task. */
> -		ppc64_print_regs(pt_regs);
> -		ppc64_print_nip_lr(pt_regs, 1);
> +		if (!(bt->flags & BT_NO_PRINT_REGS)) {
> +			/* Print the collected regs for panic task. */
> +			ppc64_print_regs(pt_regs);
> +			ppc64_print_nip_lr(pt_regs, 1);
> +		}
>  	} else if ((pc->flags & KDUMP) ||
>  		   ((pc->flags & DISKDUMP) &&
>  		    (*diskdump_flags & KDUMP_CMPRS_LOCAL))) {
> -- 
> 2.43.0
> 
--
Crash-utility mailing list -- devel@xxxxxxxxxxxxxxxxxxxxxxxxxxx
To unsubscribe send an email to devel-leave@xxxxxxxxxxxxxxxxxxxxxxxxxxx
https://${domain_name}/admin/lists/devel.lists.crash-utility.osci.io/
Contribution Guidelines: https://github.com/crash-utility/crash/wiki