On Wed, Dec 18, 2019 at 03:19:41PM +0800, Ming Lei wrote:

> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 0427a86743a4..f6e434ac4183 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -25,6 +25,8 @@
>  #include <linux/smpboot.h>
>  #include <linux/tick.h>
>  #include <linux/irq.h>
> +#include <linux/sched.h>
> +#include <linux/sched/clock.h>
>
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/irq.h>
> @@ -52,6 +54,26 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat);
>  EXPORT_PER_CPU_SYMBOL(irq_stat);
>  #endif
>
> +#define IRQ_INTERVAL_STAGE1_WEIGHT_BITS		ilog2(512)
> +#define IRQ_INTERVAL_STAGE2_WEIGHT_BITS		ilog2(128)

That must be the most difficult way of writing 9 and 7 resp.

> +#define IRQ_INTERVAL_THRESHOLD_UNIT_NS		1000
> +
> +#define IRQ_INTERVAL_MIN_THRESHOLD_NS		IRQ_INTERVAL_THRESHOLD_UNIT_NS
> +#define IRQ_INTERVAL_MAX_MIN_THRESHOLD_TIME_NS	4000000000

(seriously, a name with MAX_MIN in it?!?)

That's unreadable, we have (4*NSEC_PER_SEC) for that (if I counted the
0s correctly).

These are all a bunch of magic values; any justification for them? Will
they always work?

> +
> +struct irq_interval {
> +	u64		clock;
> +	int		avg;
> +	int		std_threshold:31;

I know of a standard deviation, but what is a standard threshold?

> +	int		stage:1;

A signed single bit.. there are people who object to that; they figure
just a sign bit isn't much use.

> +
> +	u64		stage_start_clock;
> +	unsigned	stage1_time;
> +	unsigned	stage2_time;
> +};
> +DEFINE_PER_CPU(struct irq_interval, avg_irq_interval);
> +
>  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
>
>  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
> @@ -339,6 +361,140 @@ asmlinkage __visible void do_softirq(void)
>  	local_irq_restore(flags);
>  }
>
> +static inline void irq_interval_update_avg(struct irq_interval *inter,
> +		u64 now, int weight_bits)
> +{
> +	inter->avg = inter->avg - ((inter->avg) >> weight_bits) +
> +		((now - inter->clock) >> weight_bits);

Did you perhaps want to write something like:

	s64 sample = now - inter->clock;

	inter->avg += (sample - inter->avg) >> weight_bits;

which is a recognisable form. It also shows the obvious overflow when
@sample is large (interrupts didn't happen for a while). You'll want to
clamp @sample to some max.

> +	if (unlikely(inter->avg < 0))
> +		inter->avg = 0;

And since inter->avg must never be <0, wth are you using a signed
bitfield? This generates shit code. Use an on-stack temporary if
anything:

	int avg = inter->avg;

	avg += (sample - avg) >> bits;
	if (avg < 0)
		avg = 0;

	inter->avg = avg;

and presto! no signed bitfields required.

> +}
> +
> +/*
> + * Keep the ratio of stage2 time to stage1 time between 1/2 and 1/8. If
> + * it is out of the range, adjust .std_threshold for maintaining the ratio.

It is either @std_threshold or @irq_interval::std_threshold.

> + */
> +static inline void irq_interval_update_threshold(struct irq_interval *inter)
> +{
> +	if (inter->stage2_time * 2 > inter->stage1_time)
> +		inter->std_threshold -= IRQ_INTERVAL_THRESHOLD_UNIT_NS;
> +	if (inter->stage2_time * 8 < inter->stage1_time)
> +		inter->std_threshold += IRQ_INTERVAL_THRESHOLD_UNIT_NS;

I suppose that will eventually converge.
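Back on irq_interval_update_avg() for a second: folding the clamped
@sample and the on-stack temporary together would give something like
the below. The NSEC_PER_SEC cap is a number I pulled out of thin air;
pick whatever bound makes sense for you:

	s64 sample = now - inter->clock;
	int avg = inter->avg;

	/* bound the sample so a long interrupt-free period can't blow up the average */
	sample = min_t(s64, sample, NSEC_PER_SEC);

	avg += (sample - avg) >> weight_bits;
	if (avg < 0)
		avg = 0;

	inter->avg = avg;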
> +	if (inter->std_threshold <= 0)
> +		inter->std_threshold = IRQ_INTERVAL_THRESHOLD_UNIT_NS;

I think you'll find you actually meant to write:

	if (inter->std_threshold < IRQ_INTERVAL_THRESHOLD_UNIT_NS)

> +	if (inter->std_threshold >= 32 * IRQ_INTERVAL_THRESHOLD_UNIT_NS)
> +		inter->std_threshold = 32 * IRQ_INTERVAL_THRESHOLD_UNIT_NS;

We actually have a macro for this:

	inter->std_threshold = clamp(inter->std_threshold,
				     IRQ_INTERVAL_THRESHOLD_UNIT_NS,
				     32 * IRQ_INTERVAL_THRESHOLD_UNIT_NS);

> +}
> +
> +/*
> + * If we stay at stage1 for too long with minimized threshold and low enough
> + * interrupt average interval, there may have risk to lock up CPU.

It's not locked up...

> + */
> +static bool irq_interval_cpu_lockup_risk(struct irq_interval *inter, u64 now)
> +{
> +	if (inter->avg > inter->std_threshold)
> +		return false;
> +
> +	if (inter->std_threshold != IRQ_INTERVAL_MIN_THRESHOLD_NS)
> +		return false;
> +
> +	if (now - inter->stage_start_clock <=
> +			IRQ_INTERVAL_MAX_MIN_THRESHOLD_TIME_NS)
> +		return false;
> +	return true;
> +}
> +
> +/*
> + * Update average interrupt interval with the Exponential Weighted Moving
> + * Average(EWMA), and implement two-stage interrupt flood detection.
> + *
> + * Use scheduler's runqueue software clock at default for figuring
> + * interrupt interval for saving cost. When the interval becomes zero,
> + * it is reasonable to conclude scheduler's activity on this CPU has been
> + * stopped because of interrupt flood. Then switch to the 2nd stage
> + * detection in which clock is read from hardware, and the detection
> + * result can be more reliable.
> + */
> +static void irq_interval_update(void)
> +{
> +	struct irq_interval *inter = raw_cpu_ptr(&avg_irq_interval);

raw_cpu_ptr() is wrong, this wants to be this_cpu_ptr().

> +	u64 now;
> +
> +	if (likely(!inter->stage)) {
> +		now = sched_local_rq_clock();
> +		irq_interval_update_avg(inter, now,
> +				IRQ_INTERVAL_STAGE1_WEIGHT_BITS);
> +
> +		if (unlikely(inter->avg < inter->std_threshold / 2 ||
> +				irq_interval_cpu_lockup_risk(inter, now))) {
> +			inter->stage = 1;
> +			now = sched_clock_cpu(smp_processor_id());
> +			inter->stage1_time = now - inter->stage_start_clock;
> +			inter->stage_start_clock = now;
> +
> +			/*
> +			 * reset as the mean of the min and the max value of
> +			 * stage2's threshold
> +			 */
> +			inter->avg = inter->std_threshold +
> +				(inter->std_threshold >> 2);
> +		}
> +	} else {
> +		now = sched_clock_cpu(smp_processor_id());
> +
> +		irq_interval_update_avg(inter, now,
> +				IRQ_INTERVAL_STAGE2_WEIGHT_BITS);
> +
> +		if (inter->avg > inter->std_threshold * 2) {
> +			inter->stage = 0;
> +			inter->stage2_time = now - inter->stage_start_clock;
> +			inter->stage_start_clock = now;
> +
> +			irq_interval_update_threshold(inter);
> +		}
> +	}
> +}

AFAICT the only reason for much of this complexity is so that you can
use this sched_local_rq_clock() thing, right? Once that reaches a
threshold, you go use the more accurate sched_clock_cpu(), and once that
tickles the threshold you call it golden and raise hell.

So pray tell, why did you not integrate this with IRQ_TIME_ACCOUNTING?
That already takes a timestamp and does most of what you need.

> @@ -356,6 +512,7 @@ void irq_enter(void)
>  	}
>
>  	__irq_enter();
> +	irq_interval_update();
>  }

Arggh.. you're going to make every single interrupt take at least 2
extra cache misses for this gunk?!?
And it lumps all interrupts on a single heap, and doesn't do any of the
otherwise useful things we've been wanting to have IRQ timings for :/

_If_ you want to do something like this, do it like the below. That only
adds a few instructions to irq_exit() and only touches a cacheline that's
already touched.

It computes both the avg duration and the avg inter-arrival-time of
hardirqs. Things get critical when:

	inter-arrival-avg < 2*duration-avg

or something like that.

---
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index d43318a489f2..6f5ef70b5a1d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -50,7 +50,7 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
 void irqtime_account_irq(struct task_struct *curr)
 {
 	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
-	s64 delta;
+	s64 delta, iat;
 	int cpu;
 
 	if (!sched_clock_irqtime)
@@ -58,7 +58,6 @@ void irqtime_account_irq(struct task_struct *curr)
 
 	cpu = smp_processor_id();
 	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
-	irqtime->irq_start_time += delta;
 
 	/*
 	 * We do not account for softirq time from ksoftirqd here.
@@ -66,10 +65,21 @@ void irqtime_account_irq(struct task_struct *curr)
 	 * in that case, so as not to confuse scheduler with a special task
 	 * that do not consume any time, but still wants to run.
 	 */
-	if (hardirq_count())
+	if (hardirq_count()) {
 		irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
-	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+
+		/* this is irq_exit(), delta is the duration of the hardirq */
+		irqtime->irq_duration_avg += (delta - irqtime->irq_duration_avg) >> 7;
+
+		/* compute the inter arrival time using the previous arrival time */
+		iat = irqtime->irq_start_time - irqtime->irq_arrival_time;
+		irqtime->irq_arrival_time += iat;
+		irqtime->irq_inter_arrival_avg += (iat - irqtime->irq_inter_arrival_avg) >> 7;
+
+	} else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
 		irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+
+	irqtime->irq_start_time += delta;
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 280a3c735935..cab07e5a6c11 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2236,6 +2236,9 @@ struct irqtime {
 	u64			total;
 	u64			tick_delta;
 	u64			irq_start_time;
+	u64			irq_duration_avg;
+	u64			irq_arrival_time;
+	u64			irq_inter_arrival_avg;
 	struct u64_stats_sync	sync;
 };
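For completeness, a rough sketch of the consumer side. The helper name is
made up, it assumes it lives next to irqtime_account_irq() so it can see
cpu_irqtime, and the u64_stats seqcount dance is elided:

	/*
	 * True when, on average, hardirqs arrive faster than twice their
	 * average duration, i.e. they eat more than ~half the CPU.
	 */
	static bool irq_flood_detected(void)
	{
		struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);

		return irqtime->irq_inter_arrival_avg <
		       2 * irqtime->irq_duration_avg;
	}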