>>>For some high performance IO devices, interrupts may come in very
>>>frequently, while completing an IO request can take a bit of time.
>>>Especially on some devices (SCSI or NVMe), IO requests can be submitted
>>>concurrently from multiple CPU cores, but IO completion is done on only
>>>one of these submission CPU cores.
>>>
>>>An IRQ flood can then easily be triggered, leading to CPU lockup.
>>>
>>>Implement a simple generic CPU IRQ flood detection mechanism. It uses
>>>the CPU's average interrupt interval to evaluate whether an IRQ flood
>>>has been triggered, and an Exponential Weighted Moving Average (EWMA)
>>>to compute that per-CPU average interval.
>>>
>>>Cc: Long Li <longli@xxxxxxxxxxxxx>
>>>Cc: Ingo Molnar <mingo@xxxxxxxxxx>
>>>Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
>>>Cc: Keith Busch <keith.busch@xxxxxxxxx>
>>>Cc: Jens Axboe <axboe@xxxxxx>
>>>Cc: Christoph Hellwig <hch@xxxxxx>
>>>Cc: Sagi Grimberg <sagi@xxxxxxxxxxx>
>>>Cc: John Garry <john.garry@xxxxxxxxxx>
>>>Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
>>>Cc: Hannes Reinecke <hare@xxxxxxxx>
>>>Cc: linux-nvme@xxxxxxxxxxxxxxxxxxx
>>>Cc: linux-scsi@xxxxxxxxxxxxxxx
>>>Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
>>>---
>>> drivers/base/cpu.c      | 25 ++++++++++++++++++++++
>>> include/linux/hardirq.h |  2 ++
>>> kernel/softirq.c        | 46 +++++++++++++++++++++++++++++++++++++++++
>>> 3 files changed, 73 insertions(+)
>>>
>>>diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
>>>index cc37511de866..7277d1aa0906 100644
>>>--- a/drivers/base/cpu.c
>>>+++ b/drivers/base/cpu.c
>>>@@ -20,6 +20,7 @@
>>> #include <linux/tick.h>
>>> #include <linux/pm_qos.h>
>>> #include <linux/sched/isolation.h>
>>>+#include <linux/hardirq.h>
>>>
>>> #include "base.h"
>>>
>>>@@ -183,10 +184,33 @@ static struct attribute_group crash_note_cpu_attr_group = {
>>> };
>>> #endif
>>>
>>>+static ssize_t show_irq_interval(struct device *dev,
>>>+               struct device_attribute *attr, char *buf)
>>>+{
>>>+       struct cpu *cpu = container_of(dev, struct cpu, dev);
>>>+       ssize_t rc;
>>>+       int cpunum;
>>>+
>>>+       cpunum = cpu->dev.id;
>>>+
>>>+       rc = sprintf(buf, "%llu\n", irq_get_avg_interval(cpunum));
>>>+       return rc;
>>>+}
>>>+
>>>+static DEVICE_ATTR(irq_interval, 0400, show_irq_interval, NULL);
>>>+static struct attribute *irq_interval_cpu_attrs[] = {
>>>+       &dev_attr_irq_interval.attr,
>>>+       NULL
>>>+};
>>>+static struct attribute_group irq_interval_cpu_attr_group = {
>>>+       .attrs = irq_interval_cpu_attrs,
>>>+};
>>>+
>>> static const struct attribute_group *common_cpu_attr_groups[] = {
>>> #ifdef CONFIG_KEXEC
>>>        &crash_note_cpu_attr_group,
>>> #endif
>>>+       &irq_interval_cpu_attr_group,
>>>        NULL
>>> };
>>>
>>>@@ -194,6 +218,7 @@ static const struct attribute_group *hotplugable_cpu_attr_groups[] = {
>>> #ifdef CONFIG_KEXEC
>>>        &crash_note_cpu_attr_group,
>>> #endif
>>>+       &irq_interval_cpu_attr_group,
>>>        NULL
>>> };
>>>
>>>diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
>>>index da0af631ded5..fd394060ddb3 100644
>>>--- a/include/linux/hardirq.h
>>>+++ b/include/linux/hardirq.h
>>>@@ -8,6 +8,8 @@
>>> #include <linux/vtime.h>
>>> #include <asm/hardirq.h>
>>>
>>>+extern u64 irq_get_avg_interval(int cpu);
>>>+extern bool irq_flood_detected(void);
>>>
>>> extern void synchronize_irq(unsigned int irq);
>>> extern bool synchronize_hardirq(unsigned int irq);
>>>diff --git a/kernel/softirq.c b/kernel/softirq.c
>>>index 0427a86743a4..96e01669a2e0 100644
>>>--- a/kernel/softirq.c
>>>+++ b/kernel/softirq.c
>>>@@ -25,6 +25,7 @@
>>> #include <linux/smpboot.h>
>>> #include <linux/tick.h>
>>> #include <linux/irq.h>
>>>+#include <linux/sched/clock.h>
>>>
>>> #define CREATE_TRACE_POINTS
>>> #include <trace/events/irq.h>
>>>@@ -52,6 +53,12 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat);
>>> EXPORT_PER_CPU_SYMBOL(irq_stat);
>>> #endif
>>>
>>>+struct irq_interval {
>>>+       u64     last_irq_end;
>>>+       u64     avg;
>>>+};
>>>+DEFINE_PER_CPU(struct irq_interval, avg_irq_interval);
>>>+
>>> static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
>>>
>>> DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
>>>@@ -339,6 +346,41 @@ asmlinkage __visible void do_softirq(void)
>>>        local_irq_restore(flags);
>>> }
>>>
>>>+/*
>>>+ * Update average irq interval with the Exponential Weighted Moving
>>>+ * Average(EWMA)
>>>+ */
>>>+static void irq_update_interval(void)
>>>+{
>>>+#define IRQ_INTERVAL_EWMA_WEIGHT       128
>>>+#define IRQ_INTERVAL_EWMA_PREV_FACTOR  127
>>>+#define IRQ_INTERVAL_EWMA_CURR_FACTOR  (IRQ_INTERVAL_EWMA_WEIGHT - \
>>>+               IRQ_INTERVAL_EWMA_PREV_FACTOR)
>>>+
>>>+       int cpu = raw_smp_processor_id();
>>>+       struct irq_interval *inter = per_cpu_ptr(&avg_irq_interval, cpu);
>>>+       u64 delta = sched_clock_cpu(cpu) - inter->last_irq_end;
>>>+
>>>+       inter->avg = (inter->avg * IRQ_INTERVAL_EWMA_PREV_FACTOR +
>>>+                       delta * IRQ_INTERVAL_EWMA_CURR_FACTOR) /
>>>+                       IRQ_INTERVAL_EWMA_WEIGHT;

inter->avg will start at 0? Maybe seed it with a bigger value, like
IRQ_FLOOD_THRESHOLD_NS.

>>>+}
>>>+
>>>+u64 irq_get_avg_interval(int cpu)
>>>+{
>>>+       return per_cpu_ptr(&avg_irq_interval, cpu)->avg;
>>>+}
>>>+
>>>+/*
>>>+ * If the average CPU irq interval is less than 8us, we think interrupt
>>>+ * flood is detected on this CPU
>>>+ */
>>>+bool irq_flood_detected(void)
>>>+{
>>>+#define IRQ_FLOOD_THRESHOLD_NS 8000
>>>+       return raw_cpu_ptr(&avg_irq_interval)->avg <= IRQ_FLOOD_THRESHOLD_NS;
>>>+}
>>>+
>>> /*
>>>  * Enter an interrupt context.
>>>  */
>>>@@ -356,6 +398,7 @@ void irq_enter(void)
>>>        }
>>>
>>>        __irq_enter();
>>>+       irq_update_interval();
>>> }
>>>
>>> static inline void invoke_softirq(void)
>>>@@ -402,6 +445,8 @@ static inline void tick_irq_exit(void)
>>>  */
>>> void irq_exit(void)
>>> {
>>>+       struct irq_interval *inter = raw_cpu_ptr(&avg_irq_interval);
>>>+
>>> #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
>>>        local_irq_disable();
>>> #else
>>>@@ -413,6 +458,7 @@ void irq_exit(void)
>>>        invoke_softirq();
>>>
>>>        tick_irq_exit();
>>>+       inter->last_irq_end = sched_clock_cpu(smp_processor_id());
>>>        rcu_irq_exit();
>>>        trace_hardirq_exit(); /* must be last! */
>>> }
>>>--
>>>2.20.1
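
On the zero-start point above: one possible way to seed the average so
the very first interrupts are not compared against avg == 0 would be an
early initcall next to the DEFINE_PER_CPU() in kernel/softirq.c. This is
only an untested sketch; it assumes IRQ_FLOOD_THRESHOLD_NS is hoisted out
of irq_flood_detected() to file scope, and since the check is "<=", it
seeds with a value strictly above the threshold so no flood is reported
before real samples arrive:

/*
 * Untested sketch: pre-load each CPU's average interval so the EWMA
 * does not start from zero.  Assumes IRQ_FLOOD_THRESHOLD_NS has been
 * moved to file scope in kernel/softirq.c.
 */
static int __init irq_interval_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                per_cpu_ptr(&avg_irq_interval, cpu)->avg =
                                2 * IRQ_FLOOD_THRESHOLD_NS;
        return 0;
}
early_initcall(irq_interval_init);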
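
For context on how a driver might eventually consume irq_flood_detected(),
here is a purely hypothetical handler (foo_irq, foo_queue and
foo_complete_requests are made-up names, not part of this patch). Assuming
the IRQ was set up with request_threaded_irq(), the hardirq handler could
punt completion work to its irq thread whenever the local CPU looks
flooded:

#include <linux/interrupt.h>    /* irqreturn_t, IRQ_WAKE_THREAD */
#include <linux/hardirq.h>      /* irq_flood_detected(), added by this patch */

struct foo_queue;                                       /* hypothetical driver data */
void foo_complete_requests(struct foo_queue *q);        /* hypothetical completion path */

static irqreturn_t foo_irq(int irq, void *data)
{
        struct foo_queue *q = data;

        /* Local CPU looks flooded: defer completions to the irq thread. */
        if (irq_flood_detected())
                return IRQ_WAKE_THREAD;

        /* Normal case: complete requests inline in hardirq context. */
        foo_complete_requests(q);
        return IRQ_HANDLED;
}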
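
One more observation: since the new attribute is added to the common cpu
attribute groups with mode 0400, the per-CPU average should end up
readable by root as /sys/devices/system/cpu/cpuN/irq_interval (value in
nanoseconds), which is convenient for watching the EWMA settle while
testing.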