[Crash-utility] Re: [PATCH] x86_64: rhel 9.3: "crash" doesn't handle all IRQ exception properly

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Thank you for the explanation, Aureau.

On Wed, Apr 10, 2024 at 9:48 PM Aureau, Georges (Kernel Tools ERT) <georges.aureau@xxxxxxx> wrote:
Hello Lianbo,

In my case, the fix you suggested works properly as I've got a single CPU with an IRQ exception in a SYSVEC handler.
But it we have multiple CPUs with different IRQ exceptions, then your suggestion does not work as
x86_64_irq_eframe_link() is changing irq_eframe_link, for example:

x86_64_irq_eframe_link_init():
    set default irq_eframe_link=-24 since we have sysvec_apic_timer_interrupt

Then if we do "bt -c cpu" on several cpu X, Y, X, being of their irq_stack:

x86_64_irq_eframe_link()/cpu X/rip:sysvec_apic_timer_interrupt:
    default irq_eframe_link=-24 is matching -framesize(rip:sysvec_apic_timer_interrupt)
    so we are ok

x86_64_irq_eframe_link()/cpu Y/rip:common_interrupt:
    -24 does not work as framesize(common_interrupt)=32
    we try -24-8 = -32, it works, and we set irq_eframe_link=-32

x86_64_irq_eframe_link()/cpu Z/rip:sysvec_apic_timer_interrupt:
    -32 does not work as framesize(rip:sysvec_apic_timer_interrupt)=24
    we try -32-8 = -40, it does not work, and we are back to square one !
    here we should try  -32+8 = -24

We would need to have "bt -c cpu" doing runtime per cpu selection between -32 and -24 depending on framesize(*(pcpu_hot.hardirq_stack_ptr-8)).
So we can't really see "irq_eframe_link" as being a value valid for all cpu's / IRQ.
The "irq_eframe_link" should be per cpu, depending of what each cpu is executing (more below).

Currently x86_64_irq_eframe_link() tries "stkref", then it tries "stkref-irq_eframe_link" (with irq_eframe_link set in irq_eframe_link_init()),
and then it tries "stkref-irq_eframe_link+8":

static ulong
x86_64_irq_eframe_link(ulong stkref, struct bt_info *bt, FILE *ofp)
{
        ulong irq_eframe;

        if (x86_64_exception_frame(EFRAME_VERIFY, stkref, 0, bt, ofp))
                return stkref;

        irq_eframe = stkref - machdep->machspec->irq_eframe_link;

        if (x86_64_exception_frame(EFRAME_VERIFY, irq_eframe, 0, bt, ofp))
                return irq_eframe;

        if (x86_64_exception_frame(EFRAME_VERIFY, irq_eframe+8, 0, bt, ofp)) {
                machdep->machspec->irq_eframe_link -= 8;
                return (irq_eframe + 8);
        }

        return irq_eframe;
}


I don't have the history as why we try "stkref-irq_eframe_link+8" ? Any idea ?


Me too. But it could be similar to your handling in the current patch. Just a guess.

But this current design with defaulting irq_eframe_link to -32 (or -24 as per your suggestion) does not always work.

We would need to handle both irq_stack rip's from IRQ functions  (eg. common_interrupt):

#define DEFINE_IDTENTRY_IRQ(func)                                       \
static void __##func(struct pt_regs *regs, u32 vector);                 \
                                                                        \
__visible noinstr void func(struct pt_regs *regs,                       \
                            unsigned long error_code)                   \
{                                                                       \
        irqentry_state_t state = irqentry_enter(regs);                  \
        u32 vector = (u32)(u8)error_code;                               \
                                                                        \
        instrumentation_begin();                                        \
        kvm_set_cpu_l1tf_flush_l1d();                                   \
        run_irq_on_irqstack_cond(__##func, regs, vector);               \
        instrumentation_end();                                          \
        irqentry_exit(regs, state);                                     \
}                                                                       \
                                                                        \
static noinline void __##func(struct pt_regs *regs, u32 vector)


And also to handle irq_stack rip's from SYSVEC functions  (eg. sysvec_apic_timer_interrupt):

#define DEFINE_IDTENTRY_SYSVEC(func)                                    \
static void __##func(struct pt_regs *regs);                             \
                                                                        \
__visible noinstr void func(struct pt_regs *regs)                       \
{                                                                       \
        irqentry_state_t state = irqentry_enter(regs);                  \
                                                                        \
        instrumentation_begin();                                        \
        kvm_set_cpu_l1tf_flush_l1d();                                   \
        run_sysvec_on_irqstack_cond(__##func, regs);                    \
        instrumentation_end();                                          \
        irqentry_exit(regs, state);                                     \
}                                                                       \
                                                                        \
static noinline void __##func(struct pt_regs *regs) Other changes are required.


Clearly we should be adjusting "irq_eframe_link" based on framesize for either DEFINE_IDTENTRY_IRQ() functions or DEFINE_IDTENTRY_SYSVEC() functions.
Maybe we should have  "irq_eframe_link" for DEFINE_IDTENTRY_IRQ() functions, and another "sysvec_eframe_link" for DEFINE_IDTENTRY_SYSVEC() functions, then try both links.
Anyhow, the current x86_64_irq_eframe_link_init() code is making assumption on a single common framesize , and actually there is a couple of assumptions:
- assuming framesizes with hardcoded values
- assuming a single common framesize

To keep backward compatibility with the current code, which is trying +8 (not sure why), the patch I proposed was keeping the existing +8, but it is also trying -8, and also +16 and -16,
as to try having more fallbacks in case we didn't guess the proper framesizes.

 
Seems no better way for the time being, let's go with this.

Thanks
Lianbo
 
Regards,
Georges



-----Original Message-----
From: Lianbo Jiang <lijiang@xxxxxxxxxx>
Sent: Wednesday, April 10, 2024 6:11 AM
To: devel@xxxxxxxxxxxxxxxxxxxxxxxxxxx
Cc: Aureau, Georges (Kernel Tools ERT) <georges.aureau@xxxxxxx>
Subject: Re: [PATCH] x86_64: rhel 9.3: "crash" doesn't handle all IRQ exception properly

Hi, Aureau

Thank you for the fix.

On 4/5/24 08:38, devel-request@xxxxxxxxxxxxxxxxxxxxxxxxxxx wrote:
> Date: Thu, 4 Apr 2024 14:39:06 +0000
> From: "Aureau, Georges (Kernel Tools ERT)"<georges.aureau@xxxxxxx>
> Subject: [PATCH] x86_64: rhel 9.3: "crash" doesn't
>       handle all IRQ exception properly.
> To:"devel@xxxxxxxxxxxxxxxxxxxxxxxxxxx"
>       <devel@xxxxxxxxxxxxxxxxxxxxxxxxxxx>
> Message-ID: <SJ0PR84MB14821A83D9631672B4F0F9449F3C2@xxxxxxxxxxxxxxxxxx
>       RD84.PROD.OUTLOOK.COM>
> Content-Type: text/plain; charset="us-ascii"
>
>
> Hello,
>
> On x86_64, with rhel 9.3, there are cases where "crash" can't handle IRQ exception frames properly.
>
> For example, "crash" fails with "WARNING: possibly bogus exception frame":
>
> crash> bt -c 30
> PID: 2898241  TASK: ff4cb0ce0da0c680  CPU: 30   COMMAND: "star-ccm+"
>   #0 [fffffe4658d88e58] crash_nmi_callback at ffffffffa00675e8
>   #1 [fffffe4658d88e68] nmi_handle at ffffffffa002ebab
>   #2 [fffffe4658d88eb0] default_do_nmi at ffffffffa0c56cd0
>   #3 [fffffe4658d88ed0] exc_nmi at ffffffffa0c56ee1
>   #4 [fffffe4658d88ef0] end_repeat_nmi at ffffffffa0e01639
>      [exception RIP: native_queued_spin_lock_slowpath+636]
>      RIP: ffffffffa0c6b80c  RSP: ff5eba269937ce10  RFLAGS: 00000046
>      RAX: 0000000000000000  RBX: ff4cb12bcfbb2940  RCX: 00000000007c0000
>      RDX: 0000000000000057  RSI: 0000000001600000  RDI: ff4cb12d67205608
>      RBP: ff4cb12d67205608   R8: ff90b9c682475940   R9: 0000000000000000
>      R10: ff4cb0d70ee4e100  R11: 0000000000000000  R12: 0000000000000000
>      R13: 000000000000001e  R14: ff90b9c682474918  R15: ff90b9c682477120
>      ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
> --- <NMI exception stack> ---
>   #5 [ff5eba269937ce10] native_queued_spin_lock_slowpath at ffffffffa0c6b80c
>   #6 [ff5eba269937ce30] _raw_spin_lock_irqsave at ffffffffa0c6ad90
>   #7 [ff5eba269937ce40] free_iova at ffffffffa07df716
>   #8 [ff5eba269937ce68] fq_ring_free at ffffffffa07dba15
>   #9 [ff5eba269937ce98] fq_flush_timeout at ffffffffa07dc8fe
> #10 [ff5eba269937ced0] call_timer_fn at ffffffffa01c00a4
> #11 [ff5eba269937cef0] __run_timers at ffffffffa01c03ae
> #12 [ff5eba269937cf88] run_timer_softirq at ffffffffa01c0476
> #13 [ff5eba269937cf90] __do_softirq at ffffffffa0c6c007
> #14 [ff5eba269937cfe0] __irq_exit_rcu at ffffffffa010ef61
> #15 [ff5eba269937cff0] sysvec_apic_timer_interrupt at ffffffffa0c58ca2
> --- <IRQ stack> ---
>      RIP: 0000000000000010  RSP: 0000000000000018  RFLAGS: ff5eba26ddc9f7e8
>      RAX: 0000000000000a20  RBX: ff5eba26ddc9f940  RCX: 0000000000001000
>      RDX: ffffffb559980000  RSI: ff4cb12d67207400  RDI: ffffffffffffffff
>      RBP: 0000000000001000   R8: ff5eba26ddc9f940   R9: ff5eba26ddc9f8af
>      R10: 0000000000000003  R11: 0000000000000a20  R12: ff5eba26ddc9f8b0
>      R13: 000000283c07f000  R14: ff4cb0f5a29a1c00  R15: 0000000000000001
>      ORIG_RAX: ffffffffa07c4e60  CS: 0206  SS: 7000001cf0380001
> bt: WARNING: possibly bogus exception frame
>
>
> Running "crash" with "--machdep irq_eframe_link=0xffffffffffffffe8" (ie. thus irq_eframe_link = -24) works properly:
>
> PID: 2898241  TASK: ff4cb0ce0da0c680  CPU: 30   COMMAND: "star-ccm+"
>   #0 [fffffe4658d88e58] crash_nmi_callback at ffffffffa00675e8
>   #1 [fffffe4658d88e68] nmi_handle at ffffffffa002ebab
>   #2 [fffffe4658d88eb0] default_do_nmi at ffffffffa0c56cd0
>   #3 [fffffe4658d88ed0] exc_nmi at ffffffffa0c56ee1
>   #4 [fffffe4658d88ef0] end_repeat_nmi at ffffffffa0e01639
>      [exception RIP: native_queued_spin_lock_slowpath+636]
>      RIP: ffffffffa0c6b80c  RSP: ff5eba269937ce10  RFLAGS: 00000046
>      RAX: 0000000000000000  RBX: ff4cb12bcfbb2940  RCX: 00000000007c0000
>      RDX: 0000000000000057  RSI: 0000000001600000  RDI: ff4cb12d67205608
>      RBP: ff4cb12d67205608   R8: ff90b9c682475940   R9: 0000000000000000
>      R10: ff4cb0d70ee4e100  R11: 0000000000000000  R12: 0000000000000000
>      R13: 000000000000001e  R14: ff90b9c682474918  R15: ff90b9c682477120
>      ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
> --- <NMI exception stack> ---
>   #5 [ff5eba269937ce10] native_queued_spin_lock_slowpath at ffffffffa0c6b80c
>   #6 [ff5eba269937ce30] _raw_spin_lock_irqsave at ffffffffa0c6ad90
>   #7 [ff5eba269937ce40] free_iova at ffffffffa07df716
>   #8 [ff5eba269937ce68] fq_ring_free at ffffffffa07dba15
>   #9 [ff5eba269937ce98] fq_flush_timeout at ffffffffa07dc8fe
> #10 [ff5eba269937ced0] call_timer_fn at ffffffffa01c00a4
> #11 [ff5eba269937cef0] __run_timers at ffffffffa01c03ae
> #12 [ff5eba269937cf88] run_timer_softirq at ffffffffa01c0476
> #13 [ff5eba269937cf90] __do_softirq at ffffffffa0c6c007
> #14 [ff5eba269937cfe0] __irq_exit_rcu at ffffffffa010ef61
> #15 [ff5eba269937cff0] sysvec_apic_timer_interrupt at ffffffffa0c58ca2
> --- <IRQ stack> ---
> #16 [ff5eba26ddc9f738] asm_sysvec_apic_timer_interrupt at ffffffffa0e00e06
>      [exception RIP: alloc_pte.constprop.0+32]
>      RIP: ffffffffa07c4e60  RSP: ff5eba26ddc9f7e8  RFLAGS: 00000206
>      RAX: ff5eba26ddc9f940  RBX: 0000000000001000  RCX: 0000000000000a20
>      RDX: 0000000000001000  RSI: ffffffb559980000  RDI: ff4cb12d67207400
>      RBP: ff5eba26ddc9f8b0   R8: ff5eba26ddc9f8af   R9: 0000000000000003
>      R10: 0000000000000a20  R11: ff5eba26ddc9f940  R12: 000000283c07f000
>      R13: ff4cb0f5a29a1c00  R14: 0000000000000001  R15: ff4cb0f5a29a1bf8
>      ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
> #17 [ff5eba26ddc9f830] iommu_v1_map_pages at ffffffffa07c5648
> #18 [ff5eba26ddc9f8f8] __iommu_map at ffffffffa07d7803
> #19 [ff5eba26ddc9f990] iommu_map_sg at ffffffffa07d7b71
> #20 [ff5eba26ddc9f9f0] iommu_dma_map_sg at ffffffffa07ddcc9
> #21 [ff5eba26ddc9fa90] __dma_map_sg_attrs at ffffffffa01b5205
> #22 [ff5eba26ddc9fa98] dma_map_sgtable at ffffffffa01b526d
> #23 [ff5eba26ddc9faa8] ib_umem_get at ffffffffc112f308 [ib_uverbs]
> #24 [ff5eba26ddc9fb00] mlx5_ib_reg_user_mr at ffffffffc11eb991
> [mlx5_ib]
> #25 [ff5eba26ddc9fb40] ib_uverbs_reg_mr at ffffffffc1123bc4
> [ib_uverbs]
> #26 [ff5eba26ddc9fbb0] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE at
> ffffffffc112d064 [ib_uverbs]
> #27 [ff5eba26ddc9fbe0] ib_uverbs_run_method at ffffffffc112a121
> [ib_uverbs]
> #28 [ff5eba26ddc9fc30] ib_uverbs_cmd_verbs at ffffffffc112a332
> [ib_uverbs]
> #29 [ff5eba26ddc9fe68] ib_uverbs_ioctl at ffffffffc112a494 [ib_uverbs]
> #30 [ff5eba26ddc9fea8] __x64_sys_ioctl at ffffffffa0438f67
> #31 [ff5eba26ddc9fed8] do_syscall_64 at ffffffffa0c55189
> #32 [ff5eba26ddc9ff50] entry_SYSCALL_64_after_hwframe at ffffffffa0e000ea
>      RIP: 0000146ef3a3ec6b  RSP: 00007ffe3b7dc5b8  RFLAGS: 00000246
>      RAX: ffffffffffffffda  RBX: 00007ffe3b7dc6a8  RCX: 0000146ef3a3ec6b
>      RDX: 00007ffe3b7dc690  RSI: 00000000c0181b01  RDI: 0000000000000011
>      RBP: 00007ffe3b7dc670   R8: 0000146ed9549010   R9: 00007ffe3b7dc7c4
>      R10: 0000000000000009  R11: 0000000000000246  R12: 00007ffe3b7dc7c4
>      R13: 000000000000000c  R14: 000000000000000c  R15: 0000146ed9549150
>      ORIG_RAX: 0000000000000010  CS: 0033  SS: 002b
>
>
> Some background:
>
> asm_common_interrupt:
>        callq  error_entry
>        movq   %rax,%rsp
>        movq   %rsp,%rdi
>        movq   0x78(%rsp),%rsi
>        movq   $-0x1,0x78(%rsp)
>        call common_interrupt    # rsp pointing to regs
>   
> common_interrupt
>        pushq  %r12
>        pushq  %rbp
>        pushq  %rbx
>        [...]
>        movq   hardirq_stack_ptr,%r11
>        movq   %rsp,(%r11)
>        movq   %r11,%rsp
>        [...]
>        call __common_interrupt  # rip:common_interrupt
>
> So frame_size(rip:common_interrupt) = 32 (3 push + ret).
>
> Hence "machdep->machspec->irq_eframe_link = -32;"  (see x86_64_irq_eframe_link_init()).
>
> Now:
>
> asm_sysvec_apic_timer_interrupt:
>        pushq  $-0x1
>        callq  error_entry
>        movq   %rax,%rsp
>        movq   %rsp,%rdi
>        callq  sysvec_apic_timer_interrupt
>   
> sysvec_apic_timer_interrupt
>        pushq  %r12
>        pushq  %rbp
>        [...]
>        movq   hardirq_stack_ptr,%r11
>        movq   %rsp,(%r11)
>        movq   %r11,%rsp
>        [...]
>        call __sysvec_apic_timer_interrupt  #
> rip:sysvec_apic_timer_interrupt
>
> Here frame_size(rip:sysvec_apic_timer_interrupt) = 24 (2 push + ret)
>
> We should also notice that:
>
> rip  = *(hardirq_stack_ptr - 8)
> rsp  = *(hardirq_stack_ptr)
> regs = rsp - frame_size(rip)
>
> But x86_64_get_framesize() does not work with IRQ handlers (returns 0).
> So not many options other than hardcoding the most likely value and looking around it.
> Actually x86_64_irq_eframe_link() was trying -32 (default), and then -40,  but not -24 !
>
> Signed-off-by: Georges Aureau<georges.aureau@xxxxxxx>
> --
> diff --git a/x86_64.c b/x86_64.c
> index aec82b0..90c2a91 100644
> --- a/x86_64.c
> +++ b/x86_64.c
> @@ -6623,13 +6623,14 @@ x86_64_irq_eframe_link_init(void)
>
>   /*
>    *  Calculate and verify the IRQ exception frame location from the
> - *  stack reference at the top of the IRQ stack, possibly adjusting
> - *  the ms->irq_eframe_link value.
> + *  stack reference at the top of the IRQ stack, keep
> + ms->irq_eframe_link
> + *  as the most likely value, and try a few sizes around it.
>    */
>   static ulong
>   x86_64_irq_eframe_link(ulong stkref, struct bt_info *bt, FILE *ofp)
>   {
>          ulong irq_eframe;
> +       int i, try[] = { 8, -8, 16, -16 };
>
>          if (x86_64_exception_frame(EFRAME_VERIFY, stkref, 0, bt, ofp))
>                  return stkref;
> @@ -6639,9 +6640,10 @@ x86_64_irq_eframe_link(ulong stkref, struct bt_info *bt, FILE *ofp)
>          if (x86_64_exception_frame(EFRAME_VERIFY, irq_eframe, 0, bt, ofp))
>                  return irq_eframe;
>
> -       if (x86_64_exception_frame(EFRAME_VERIFY, irq_eframe+8, 0, bt, ofp)) {
> -               machdep->machspec->irq_eframe_link -= 8;
> -               return (irq_eframe + 8);
> +       for (i=0; i<sizeof(try)/sizeof(int); i++) {
> +               if (x86_64_exception_frame(EFRAME_VERIFY, irq_eframe+try[i], 0, bt, ofp)) {
> +                       return (irq_eframe + try[i]);
> +               }
>          }
>
>          return irq_eframe;


Can you help to try the following changes? It could be good to check the relevant symbol, but I'm not sure if this can work well for you and cover some corner cases.

diff --git a/x86_64.c b/x86_64.c
index aec82b03b62e..251e224c013b 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -6574,6 +6574,8 @@ x86_64_irq_eframe_link_init(void)
      if (symbol_exists("asm_common_interrupt")) {
          if (symbol_exists("asm_call_on_stack"))
              machdep->machspec->irq_eframe_link = -64;
+        else if (symbol_exists("sysvec_apic_timer_interrupt"))
+            machdep->machspec->irq_eframe_link = -24;
          else
              machdep->machspec->irq_eframe_link = -32;
          return;


Thanks

Lianbo

--
Crash-utility mailing list -- devel@xxxxxxxxxxxxxxxxxxxxxxxxxxx
To unsubscribe send an email to devel-leave@xxxxxxxxxxxxxxxxxxxxxxxxxxx
https://${domain_name}/admin/lists/devel.lists.crash-utility.osci.io/
Contribution Guidelines: https://github.com/crash-utility/crash/wiki

[Index of Archives]     [Fedora Development]     [Fedora Desktop]     [Fedora SELinux]     [Yosemite News]     [KDE Users]     [Fedora Tools]

 

Powered by Linux