On Wed, Oct 28, 2020 at 7:58 PM Thomas Gleixner <tglx@xxxxxxxxxxxxx> wrote: > [...] > --- > include/linux/irqdesc.h | 4 ++ > kernel/irq/manage.c | 3 + > kernel/irq/spurious.c | 74 +++++++++++++++++++++++++++++++++++------------- > 3 files changed, 61 insertions(+), 20 deletions(-) > > --- a/include/linux/irqdesc.h > +++ b/include/linux/irqdesc.h > @@ -30,6 +30,8 @@ struct pt_regs; > * @tot_count: stats field for non-percpu irqs > * @irq_count: stats field to detect stalled irqs > * @last_unhandled: aging timer for unhandled count > + * @storm_count: Counter for irq storm detection > + * @storm_checked: Timestamp for irq storm detection > * @irqs_unhandled: stats field for spurious unhandled interrupts > * @threads_handled: stats field for deferred spurious detection of threaded handlers > * @threads_handled_last: comparator field for deferred spurious detection of theraded handlers > @@ -65,6 +67,8 @@ struct irq_desc { > unsigned int tot_count; > unsigned int irq_count; /* For detecting broken IRQs */ > unsigned long last_unhandled; /* Aging timer for unhandled count */ > + unsigned long storm_count; > + unsigned long storm_checked; > unsigned int irqs_unhandled; > atomic_t threads_handled; > int threads_handled_last; > --- a/kernel/irq/manage.c > +++ b/kernel/irq/manage.c > @@ -1581,6 +1581,9 @@ static int > if (!shared) { > init_waitqueue_head(&desc->wait_for_threads); > > + /* Take a timestamp for interrupt storm detection */ > + desc->storm_checked = jiffies; > + > /* Setup the type (level, edge polarity) if configured: */ > if (new->flags & IRQF_TRIGGER_MASK) { > ret = __irq_set_trigger(desc, > --- a/kernel/irq/spurious.c > +++ b/kernel/irq/spurious.c > @@ -21,6 +21,7 @@ static void poll_spurious_irqs(struct ti > static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs); > static int irq_poll_cpu; > static atomic_t irq_poll_active; > +static unsigned long irqstorm_limit __ro_after_init; > > /* > * We wait here for a poller to finish. > @@ -189,18 +190,21 @@ static inline int bad_action_ret(irqretu > * (The other 100-of-100,000 interrupts may have been a correctly > * functioning device sharing an IRQ with the failing one) > */ > -static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) > +static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret, > + bool storm) > { > unsigned int irq = irq_desc_get_irq(desc); > struct irqaction *action; > unsigned long flags; > > - if (bad_action_ret(action_ret)) { > - printk(KERN_ERR "irq event %d: bogus return value %x\n", > - irq, action_ret); > - } else { > - printk(KERN_ERR "irq %d: nobody cared (try booting with " > + if (!storm) { > + if (bad_action_ret(action_ret)) { > + pr_err("irq event %d: bogus return value %x\n", > + irq, action_ret); > + } else { > + pr_err("irq %d: nobody cared (try booting with " > "the \"irqpoll\" option)\n", irq); > + } > } > dump_stack(); > printk(KERN_ERR "handlers:\n"); > @@ -228,7 +232,7 @@ static void report_bad_irq(struct irq_de > > if (count > 0) { > count--; > - __report_bad_irq(desc, action_ret); > + __report_bad_irq(desc, action_ret, false); > } > } > > @@ -267,6 +271,33 @@ try_misrouted_irq(unsigned int irq, stru > return action && (action->flags & IRQF_IRQPOLL); > } > > +static void disable_stuck_irq(struct irq_desc *desc, irqreturn_t action_ret, > + const char *reason, bool storm) > +{ > + __report_bad_irq(desc, action_ret, storm); > + pr_emerg("Disabling %s IRQ #%d\n", reason, irq_desc_get_irq(desc)); > + desc->istate |= IRQS_SPURIOUS_DISABLED; > + desc->depth++; > + irq_disable(desc); > +} > + > +/* Interrupt storm detector for runaway interrupts (handled or not). */ > +static bool irqstorm_detected(struct irq_desc *desc) > +{ > + unsigned long now = jiffies; > + > + if (++desc->storm_count < irqstorm_limit) { > + if (time_after(now, desc->storm_checked + HZ)) { > + desc->storm_count = 0; > + desc->storm_checked = now; > + } > + return false; > + } > + > + disable_stuck_irq(desc, IRQ_NONE, "runaway", true); > + return true; > +} > + > #define SPURIOUS_DEFERRED 0x80000000 > > void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) > @@ -403,24 +434,16 @@ void note_interrupt(struct irq_desc *des > desc->irqs_unhandled -= ok; > } > > + if (unlikely(irqstorm_limit && irqstorm_detected(desc))) > + return; > + > desc->irq_count++; > if (likely(desc->irq_count < 100000)) > return; > > desc->irq_count = 0; > if (unlikely(desc->irqs_unhandled > 99900)) { > - /* > - * The interrupt is stuck > - */ > - __report_bad_irq(desc, action_ret); > - /* > - * Now kill the IRQ > - */ > - printk(KERN_EMERG "Disabling IRQ #%d\n", irq); > - desc->istate |= IRQS_SPURIOUS_DISABLED; > - desc->depth++; > - irq_disable(desc); > - > + disable_stuck_irq(desc, action_ret, "unhandled", false); > mod_timer(&poll_spurious_irq_timer, > jiffies + POLL_SPURIOUS_IRQ_INTERVAL); > } > @@ -462,5 +485,16 @@ static int __init irqpoll_setup(char *st > "performance\n"); > return 1; > } > - > __setup("irqpoll", irqpoll_setup); > + > +static int __init irqstorm_setup(char *arg) > +{ > + int res = kstrtoul(arg, 0, &irqstorm_limit); > + > + if (!res) { > + pr_info("Interrupt storm detector enabled. Limit=%lu / s\n", > + irqstorm_limit); > + } > + return !!res; > +} > +__setup("irqstorm_limit", irqstorm_setup); It should be __setup("irqstorm_limit=", irqstorm_setup); And I have tested this patch on the P9 machine, where I set the limit to 70000. It works for kdump kernel. Thanks, Pingfan