[Crash-utility] Re: [PATCH v1 1/3] ppc64: Fix bt printing error stack trace

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Tao,


On 04/10/24 10:40, Tao Lou wrote:
Hi Aditya & lianbo,

On Tue, Sep 24, 2024 at 2:28 AM Aditya Gupta <adityag@xxxxxxxxxxxxx> wrote:
Hi Tao,

On 24/09/22 10:25PM, Aditya Gupta wrote:
Hi Tao,


On 18/09/24 05:12, Tao Liu wrote:
A error stack trace of bt cmd observed:

crash> bt 1
PID: 1        TASK: c000000003714b80  CPU: 2    COMMAND: "systemd"
   #0 [c0000000037735c0] _end at c0000000037154b0  (unreliable)
   #1 [c000000003773770] __switch_to at c00000000001fa9c
   #2 [c0000000037737d0] __schedule at c00000000112e4ec
   #3 [c0000000037738b0] schedule at c00000000112ea80
   ...

The #0 stack trace is incorrect, the function address shouldn't exceed _end.
The reason is for kernel>=v6.2, the offset of pt_regs to sp changed from
STACK_FRAME_OVERHEAD, i.e 112, to STACK_SWITCH_FRAME_REGS. For
CONFIG_PPC64_ELF_ABI_V1, it's 112, for ABI_V2, it's 48. So the nip will read a
wrong value from stack when ABI_V2 enabled.

To determine if ABI_V2 enabled is tricky. This patch do it by check the
following:

In arch/powerpc/include/asm/ppc_asm.h:
    #ifdef CONFIG_PPC64_ELF_ABI_V2
    #define STK_GOT          24
    #else
    #define STK_GOT          40

In arch/powerpc/kernel/tm.S:
    _GLOBAL(tm_reclaim)
     mfcr    r5
     mflr    r0
     stw     r5, 8(r1)
     std     r0, 16(r1)
     std     r2, STK_GOT(r1)
     ...

So a disassemble on tm_reclaim, and extract the STK_GOT value from std
instruction is used as the approach.

After the patch:
crash> bt 1
PID: 1        TASK: c000000003714b80  CPU: 2    COMMAND: "systemd"
   #0 [c0000000037737d0] __schedule at c00000000112e4ec
   #1 [c0000000037738b0] schedule at c00000000112ea80
   ...

Signed-off-by: Tao Liu <ltao@xxxxxxxxxx>
---
   defs.h  |  1 +
   ppc64.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
   2 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/defs.h b/defs.h
index 2231cb6..d5cb8cc 100644
--- a/defs.h
+++ b/defs.h
@@ -4643,6 +4643,7 @@ struct efi_memory_desc_t {
   #define MSR_PR_LG 14      /* Problem State / Privilege Level */
                             /* Used to find the user or kernel-mode frame*/
+#define STACK_SWITCH_FRAME_REGS         48
   #define STACK_FRAME_OVERHEAD            112
   #define EXCP_FRAME_MARKER               0x7265677368657265
diff --git a/ppc64.c b/ppc64.c
index e8930a1..6e5f155 100644
--- a/ppc64.c
+++ b/ppc64.c
@@ -72,6 +72,7 @@ static ulong pud_page_vaddr_l4(ulong pud);
   static ulong pmd_page_vaddr_l4(ulong pmd);
   static int is_opal_context(ulong sp, ulong nip);
   void opalmsg(void);
+static bool is_ppc64_elf_abi_v2(void);
   static int is_opal_context(ulong sp, ulong nip)
   {
@@ -2813,6 +2814,51 @@ ppc64_get_sp(ulong task)
     return sp;
   }
+static bool
+is_ppc64_elf_abi_v2(void)
+{

Thanks for fixing these issues on ppc64 !


It is good. I am also checking if there is a way to implement
'is_ppc64_elf_abi_v2' by getting ABI version from the ELF notes, as 'file'
command somehow does know:

$ file vmlinux-little-endian vmlinux-little-endian: ELF 64-bit LSB
executable, 64-bit PowerPC or cisco 7500, OpenPOWER ELF V2 ABI, version 1
(SYSV), statically linked,
BuildID[sha1]=f6449970e6af6cddeed1e260393e2377ccc0f369, not stripped

That might be more dependable (if we can do that).
How about this approach ?

I feel this might be more dependable, as the file command also uses
this, as well as fadump (PowerPC firmware-assisted dump) in the kernel.

As a sidenote, there is still some issue either in patch #1 or #3
causing all secondary CPUs (where swapper task is running) to show all
threads as '_end'
What do you think ?
I think your approach will work, thanks for your code! Please see my
comments inlined in the code:

Either way what you chose, I do realise the idea to utilise the
'std r2, STK_GOT(r1)' in such a way, was interesting and smart.

     static bool
     is_ppc64_elf_abi_v2(void)
     {
         const char *kernel_file = pc->namelist;
         int fd, swap;
         char eheader[BUFSIZE];
         Elf64_Ehdr *elf64;

         if ((fd = open(kernel_file, O_RDONLY)) < 0) {
                 error(INFO, "%s: %s\n", kernel_file, strerror(errno));
                 return FALSE;
         }
         if (read(fd, eheader, BUFSIZE) != BUFSIZE) {
             error(INFO, "%s: %s\n", kernel_file, strerror(errno));
                 close(fd);
                 return FALSE;
         }
         close(fd);

         if (!STRNEQ(eheader, ELFMAG) || eheader[EI_VERSION] != EV_CURRENT)
                 return FALSE;

         elf64 = (Elf64_Ehdr *)&eheader[0];

         swap = (((eheader[EI_DATA] == ELFDATA2LSB) &&
              (__BYTE_ORDER == __BIG_ENDIAN)) ||
             ((eheader[EI_DATA] == ELFDATA2MSB) &&
              (__BYTE_ORDER == __LITTLE_ENDIAN)));
The above code seems duplicated with the one in is_kernel(), in fact,
is_kernel() will be invoked at the very beginning to determine if one
file is vmlinux, so I would suggest to add our abi_v2 check here,
like:

in ppc64.c:
int ppc64_elf_abi = 0;

static bool is_ppc64_elf_abi_v2(void)
{
   if (ppc64_elf_abi == 1)
...
   else if (ppc64_elf_abi == 2)
...
   else
...
}

in symbols.c:is_kernel():
   ...
   } else if ((elf64->e_ident[EI_CLASS] == ELFCLASS64) &&
     ((swap16(elf64->e_type, swap) == ET_EXEC) ||
   ...
       case EM_PPC64:
         if (machine_type_mismatch(file, "PPC64", NULL, 0))
           goto bailout;
+       else
+         ppc64_elf_abi = swap32(elf64->e_flags, swap);
         break;

So we can initial the abi version at crash start only once, and get
rid of the code duplication.

Yes, I actually copied pasted the initial part from is_kernel.

The approach is good,  just to clarify, we know that is_kernel will be called everytime before this check right ?


Thanks,
Aditya Gupta


         /*
          * The ABI version is stored in E_FLAGS in the ELF header, atleast for
          * ppc64 binaries.
          *
          * These logic is also used by the 'file' command, which states this in
          * its magic file:
          *
          *     >18      leshort         20              PowerPC or cisco 4500,
          *     >18      leshort         21              64-bit PowerPC or cisco 7500,
          *     >>48     lelong          0               Unspecified or Power ELF V1 ABI,
          *     >>48     lelong          1               Power ELF V1 ABI,
          *     >>48     lelong          2               OpenPOWER ELF V2 ABI,
          *
          * The '>18' above means, data at 18 (0x12), which is basically
          * 'elf64->e_machine', which will be 20 for PPC, and 21 for PPC64
          *
          * Similarly, '>>48' is offset 48 (0x30) in the file, which is
          * basically 'elf64->e_flags', which has the ELF ABI version
          */
         if (elf64->e_flags == 2) {
                 /* ABI v2 */
                 return TRUE;
         } else if (elf64->e_flags == 1) {
                 /* ABI v1 */
                 return FALSE;
         }

         error(WARNING, "Unstable elf_abi v1/v2 detection.\n");
         return FALSE;
     }

Also, we can those values in the ELF header:

     $ hexdump vmlinux-little-endian
     0000000 457f 464c 0102 0001 0000 0000 0000 0000
     0000010 0002 0015 (= 21 = E_MACHINE)  0001 0000 0000 0000 0000 c000
     0000020 0040 0000 0000 0000 8bf8 0040 0000 0000
     0000030 0002 (= E_FLAGS = ABI)   0000 0040 0038 0002 0040 0027 0026
     0000040 0001 0000 0007 0000 0000 0001 0000 0000

I agree with the elf header check on the vmlinux file, it is much
simpler. Also I compiled the kernel with CONFIG_PPC64_ELF_ABI_V1 on,
the file cmd will give: Power ELF V1 ABI string. So the approach can
work.

Also, I see 'pc->namelist' is basically the kernel filename, is that
correct ? Can that be renamed to kernel_filename or something ?

Yes, by reading through the code, pc->namelist is vmlinux, and
pc->namelist_debug is the debug symbol if vmlinux doesn't have the
symbol. But I didn't see the need for the rename.

Thanks,
Tao Liu

Thanks,
Aditya Gupta

Thanks,

Aditya Gupta

+   char buf1[BUFSIZE];
+   char *pos1, *pos2;
+   int errflag = 0;
+   ulong stk_got = 0;
+   static bool ret = false;
+   static bool checked = false;
+
+   if (checked == true || !symbol_exists("tm_reclaim"))
+           return ret;
+
+   sprintf(buf1, "x/16i tm_reclaim");
+   open_tmpfile();
+   if (!gdb_pass_through(buf1, pc->tmpfile, GNU_RETURN_ON_ERROR))
+           goto out;
+   checked = true;
+   rewind(pc->tmpfile);
+   while (fgets(buf1, BUFSIZE, pc->tmpfile)) {
+           // "std r2, STK_GOT(r1)" is expected
+           if (strstr(buf1, "std") &&
+               strstr(buf1, "(r1)") &&
+               (pos1 = strstr(buf1, "r2,"))) {
+                   pos1 += strlen("r2,");
+                   for (pos2 = pos1; *pos2 != '\0' && *pos2 != '('; pos2++);
+                   *pos2 = '\0';
+                   stk_got = stol(pos1, RETURN_ON_ERROR|QUIET, &errflag);
+                   break;
+           }
+   }
+
+   if (!errflag) {
+           switch (stk_got) {
+           case 24:
+                   ret = true;
+           case 40:
+                   goto out;
+           }
+   }
+   error(WARNING, "Unstable elf_abi v1/v2 detection.\n");
+out:
+   close_tmpfile();
+   return ret;
+}
   /*
    *  get the SP and PC values for idle tasks.
@@ -2834,9 +2880,17 @@ get_ppc64_frame(struct bt_info *bt, ulong *getpc, ulong *getsp)
     sp = ppc64_get_sp(task);
     if (!INSTACK(sp, bt))
             goto out;
-   readmem(sp+STACK_FRAME_OVERHEAD, KVADDR, &regs,
-           sizeof(struct ppc64_pt_regs),
-           "PPC64 pt_regs", FAULT_ON_ERROR);
+
+   if (THIS_KERNEL_VERSION >= LINUX(6,2,0) && is_ppc64_elf_abi_v2()) {
+           readmem(sp+STACK_SWITCH_FRAME_REGS, KVADDR, &regs,
+                   sizeof(struct ppc64_pt_regs),
+                   "PPC64 pt_regs", FAULT_ON_ERROR);
+   } else {
+           readmem(sp+STACK_FRAME_OVERHEAD, KVADDR, &regs,
+                   sizeof(struct ppc64_pt_regs),
+                   "PPC64 pt_regs", FAULT_ON_ERROR);
+   }
+
     ip = regs.nip;
     closest = closest_symbol(ip);
     if (STREQ(closest, ".__switch_to") || STREQ(closest, "__switch_to")) {
--
Crash-utility mailing list -- devel@xxxxxxxxxxxxxxxxxxxxxxxxxxx
To unsubscribe send an email to devel-leave@xxxxxxxxxxxxxxxxxxxxxxxxxxx
https://${domain_name}/admin/lists/devel.lists.crash-utility.osci.io/
Contribution Guidelines: https://github.com/crash-utility/crash/wiki




[Index of Archives]     [Fedora Development]     [Fedora Desktop]     [Fedora SELinux]     [Yosemite News]     [KDE Users]     [Fedora Tools]

 

Powered by Linux