> The untested below should do the trick. Wants to be split in several
> patches, but you get the idea.

I will continue the work from what you posted.

Thanks a lot!
    Xin

> ---
> Subject: x86/vector: Get rid of IRQ_MOVE_CLEANUP_VECTOR
> From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
>
> No point to waste a vector for cleaning up the leftovers of a moved
> interrupt. Aside of that this must be the lowest priority of all vectors
> which makes FRED systems utilizing vectors 0x10-0x1f more complicated
> than necessary.
>
> Schedule a timer instead.
>
> Not-Yet-Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> ---
>  arch/x86/include/asm/hw_irq.h       |    4 -
>  arch/x86/include/asm/idtentry.h     |    1
>  arch/x86/include/asm/irq_vectors.h  |    7 ---
>  arch/x86/kernel/apic/vector.c       |   83 ++++++++++++++++++++++++++----------
>  arch/x86/kernel/idt.c               |    1
>  arch/x86/platform/uv/uv_irq.c       |    2
>  drivers/iommu/amd/iommu.c           |    2
>  drivers/iommu/hyperv-iommu.c        |    4 -
>  drivers/iommu/intel/irq_remapping.c |    2
>  9 files changed, 68 insertions(+), 38 deletions(-)
>
> --- a/arch/x86/include/asm/hw_irq.h
> +++ b/arch/x86/include/asm/hw_irq.h
> @@ -97,10 +97,10 @@ extern struct irq_cfg *irqd_cfg(struct i
>  extern void lock_vector_lock(void);
>  extern void unlock_vector_lock(void);
>  #ifdef CONFIG_SMP
> -extern void send_cleanup_vector(struct irq_cfg *);
> +extern void vector_schedule_cleanup(struct irq_cfg *);
>  extern void irq_complete_move(struct irq_cfg *cfg);
>  #else
> -static inline void send_cleanup_vector(struct irq_cfg *c) { }
> +static inline void vector_schedule_cleanup(struct irq_cfg *c) { }
>  static inline void irq_complete_move(struct irq_cfg *c) { }
>  #endif
>
> --- a/arch/x86/include/asm/idtentry.h
> +++ b/arch/x86/include/asm/idtentry.h
> @@ -648,7 +648,6 @@ DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI
>  
>  #ifdef CONFIG_SMP
>  DECLARE_IDTENTRY(RESCHEDULE_VECTOR,			sysvec_reschedule_ipi);
> -DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR,	sysvec_irq_move_cleanup);
>  DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR,			sysvec_reboot);
>  DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR,	sysvec_call_function_single);
>  DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR,		sysvec_call_function);
>
> --- a/arch/x86/include/asm/irq_vectors.h
> +++ b/arch/x86/include/asm/irq_vectors.h
> @@ -35,13 +35,6 @@
>   */
>  #define FIRST_EXTERNAL_VECTOR		0x20
>  
> -/*
> - * Reserve the lowest usable vector (and hence lowest priority) 0x20 for
> - * triggering cleanup after irq migration. 0x21-0x2f will still be used
> - * for device interrupts.
> - */
> -#define IRQ_MOVE_CLEANUP_VECTOR		FIRST_EXTERNAL_VECTOR
> -
>  #define IA32_SYSCALL_VECTOR		0x80
>  
>  /*
>
> --- a/arch/x86/kernel/apic/vector.c
> +++ b/arch/x86/kernel/apic/vector.c
> @@ -44,7 +44,18 @@ static cpumask_var_t vector_searchmask;
>  static struct irq_chip lapic_controller;
>  static struct irq_matrix *vector_matrix;
>  #ifdef CONFIG_SMP
> -static DEFINE_PER_CPU(struct hlist_head, cleanup_list);
> +
> +static void vector_cleanup_callback(struct timer_list *tmr);
> +
> +struct vector_cleanup {
> +	struct hlist_head	head;
> +	struct timer_list	timer;
> +};
> +
> +static DEFINE_PER_CPU(struct vector_cleanup, vector_cleanup) = {
> +	.head	= HLIST_HEAD_INIT,
> +	.timer	= __TIMER_INITIALIZER(vector_cleanup_callback, TIMER_PINNED),
> +};
>  #endif
>  
>  void lock_vector_lock(void)
> @@ -843,8 +854,12 @@ void lapic_online(void)
>  
>  void lapic_offline(void)
>  {
> +	struct vector_cleanup *cl = this_cpu_ptr(&vector_cleanup);
> +
>  	lock_vector_lock();
>  	irq_matrix_offline(vector_matrix);
> +	WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0);
> +	WARN_ON_ONCE(!hlist_empty(&cl->head));
>  	unlock_vector_lock();
>  }
>  
> @@ -934,62 +949,86 @@ static void free_moved_vector(struct api
>  	apicd->move_in_progress = 0;
>  }
>  
> -DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup)
> +static void vector_cleanup_callback(struct timer_list *tmr)
>  {
> -	struct hlist_head *clhead = this_cpu_ptr(&cleanup_list);
> +	struct vector_cleanup *cl = container_of(tmr, typeof(*cl), timer);
>  	struct apic_chip_data *apicd;
>  	struct hlist_node *tmp;
> +	bool rearm = false;
>  
> -	ack_APIC_irq();
>  	/* Prevent vectors vanishing under us */
> -	raw_spin_lock(&vector_lock);
> +	raw_spin_lock_irq(&vector_lock);
>  
> -	hlist_for_each_entry_safe(apicd, tmp, clhead, clist) {
> +	hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) {
>  		unsigned int irr, vector = apicd->prev_vector;
>  
>  		/*
>  		 * Paranoia: Check if the vector that needs to be cleaned
> -		 * up is registered at the APICs IRR. If so, then this is
> -		 * not the best time to clean it up. Clean it up in the
> -		 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
> -		 * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest
> -		 * priority external vector, so on return from this
> -		 * interrupt the device interrupt will happen first.
> +		 * up is registered at the APICs IRR. That's clearly a
> +		 * hardware issue if the vector arrived on the old target
> +		 * _after_ interrupts were disabled above. Keep @apicd
> +		 * on the list and schedule the timer again to give the CPU
> +		 * a chance to handle the pending interrupt.
>  		 */
>  		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
>  		if (irr & (1U << (vector % 32))) {
> -			apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
> +			pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq);
> +			rearm = true;
>  			continue;
>  		}
>  		free_moved_vector(apicd);
>  	}
>  
> -	raw_spin_unlock(&vector_lock);
> +	/*
> +	 * Must happen under vector_lock to make the timer_pending() check
> +	 * in __vector_schedule_cleanup() race free against the rearm here.
> +	 */
> +	if (rearm)
> +		mod_timer(tmr, jiffies + 1);
> +
> +	raw_spin_unlock_irq(&vector_lock);
>  }
>  
> -static void __send_cleanup_vector(struct apic_chip_data *apicd)
> +static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
>  {
> -	unsigned int cpu;
> +	unsigned int cpu = apicd->prev_cpu;
>  
>  	raw_spin_lock(&vector_lock);
>  	apicd->move_in_progress = 0;
> -	cpu = apicd->prev_cpu;
>  	if (cpu_online(cpu)) {
> -		hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu));
> -		apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR);
> +		struct vector_cleanup *cl = per_cpu_ptr(&vector_cleanup, cpu);
> +
> +		/*
> +		 * The lockless timer_pending() check is safe here. If it
> +		 * returns true, then the callback will observe this new
> +		 * apic data in the hlist as everything is serialized by
> +		 * vector lock.
> +		 *
> +		 * If it returns false then the timer is either not armed
> +		 * or the other CPU executes the callback, which again
> +		 * would be blocked on vector lock. Rearming it in the
> +		 * latter case makes it fire for nothing.
> +		 *
> +		 * This is also safe against the callback rearming the timer
> +		 * because that's serialized via vector lock too.
> +		 */
> +		if (!timer_pending(&cl->timer)) {
> +			cl->timer.expires = jiffies + 1;
> +			add_timer_on(&cl->timer, cpu);
> +		}
>  	} else {
>  		apicd->prev_vector = 0;
>  	}
>  	raw_spin_unlock(&vector_lock);
>  }
>  
> -void send_cleanup_vector(struct irq_cfg *cfg)
> +void vector_schedule_cleanup(struct irq_cfg *cfg)
>  {
>  	struct apic_chip_data *apicd;
>  
>  	apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
>  	if (apicd->move_in_progress)
> -		__send_cleanup_vector(apicd);
> +		__vector_schedule_cleanup(apicd);
>  }
>  
>  void irq_complete_move(struct irq_cfg *cfg)
> @@ -1007,7 +1046,7 @@ void irq_complete_move(struct irq_cfg *c
>  	 * on the same CPU.
>  	 */
>  	if (apicd->cpu == smp_processor_id())
> -		__send_cleanup_vector(apicd);
> +		__vector_schedule_cleanup(apicd);
>  }
>  
>  /*
>
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -131,7 +131,6 @@ static const __initconst struct idt_data
>  	INTG(RESCHEDULE_VECTOR,			asm_sysvec_reschedule_ipi),
>  	INTG(CALL_FUNCTION_VECTOR,		asm_sysvec_call_function),
>  	INTG(CALL_FUNCTION_SINGLE_VECTOR,	asm_sysvec_call_function_single),
> -	INTG(IRQ_MOVE_CLEANUP_VECTOR,		asm_sysvec_irq_move_cleanup),
>  	INTG(REBOOT_VECTOR,			asm_sysvec_reboot),
>  #endif
>
> --- a/arch/x86/platform/uv/uv_irq.c
> +++ b/arch/x86/platform/uv/uv_irq.c
> @@ -58,7 +58,7 @@ uv_set_irq_affinity(struct irq_data *dat
>  	ret = parent->chip->irq_set_affinity(parent, mask, force);
>  	if (ret >= 0) {
>  		uv_program_mmr(cfg, data->chip_data);
> -		send_cleanup_vector(cfg);
> +		vector_schedule_cleanup(cfg);
>  	}
>  
>  	return ret;
>
> --- a/drivers/iommu/amd/iommu.c
> +++ b/drivers/iommu/amd/iommu.c
> @@ -3639,7 +3639,7 @@ static int amd_ir_set_affinity(struct ir
>  	 * at the new destination. So, time to cleanup the previous
>  	 * vector allocation.
>  	 */
> -	send_cleanup_vector(cfg);
> +	vector_schedule_cleanup(cfg);
>  
>  	return IRQ_SET_MASK_OK_DONE;
>  }
>
> --- a/drivers/iommu/hyperv-iommu.c
> +++ b/drivers/iommu/hyperv-iommu.c
> @@ -51,7 +51,7 @@ static int hyperv_ir_set_affinity(struct
>  	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
>  		return ret;
>  
> -	send_cleanup_vector(cfg);
> +	vector_schedule_cleanup(cfg);
>  
>  	return 0;
>  }
> @@ -257,7 +257,7 @@ static int hyperv_root_ir_set_affinity(s
>  	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
>  		return ret;
>  
> -	send_cleanup_vector(cfg);
> +	vector_schedule_cleanup(cfg);
>  
>  	return 0;
>  }
>
> --- a/drivers/iommu/intel/irq_remapping.c
> +++ b/drivers/iommu/intel/irq_remapping.c
> @@ -1180,7 +1180,7 @@ intel_ir_set_affinity(struct irq_data *d
>  	 * at the new destination. So, time to cleanup the previous
>  	 * vector allocation.
>  	 */
> -	send_cleanup_vector(cfg);
> +	vector_schedule_cleanup(cfg);
>  
>  	return IRQ_SET_MASK_OK_DONE;
>  }
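
A footnote on the locking scheme, for anyone following along: everything
hinges on arming the pinned per-CPU timer only when it is not already
pending, with both the arming side and the callback's rearm path serialized
by vector_lock. Below is an untested, self-contained sketch of just that
pattern, separate from the patch. All demo_* names are invented for
illustration (demo_lock stands in for vector_lock, demo_item for the
apic_chip_data entries); only the timer, hlist and spinlock APIs are real.

#include <linux/timer.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/slab.h>

static DEFINE_RAW_SPINLOCK(demo_lock);

static void demo_callback(struct timer_list *tmr);

struct demo_item {
	struct hlist_node	node;
};

struct demo_cleanup {
	struct hlist_head	head;
	struct timer_list	timer;
};

/* One pinned timer per CPU, like vector_cleanup in the patch */
static DEFINE_PER_CPU(struct demo_cleanup, demo_cleanup) = {
	.head	= HLIST_HEAD_INIT,
	.timer	= __TIMER_INITIALIZER(demo_callback, TIMER_PINNED),
};

static void demo_callback(struct timer_list *tmr)
{
	struct demo_cleanup *cl = container_of(tmr, typeof(*cl), timer);
	struct demo_item *item;
	struct hlist_node *tmp;
	bool rearm = false;

	raw_spin_lock_irq(&demo_lock);

	hlist_for_each_entry_safe(item, tmp, &cl->head, node) {
		/*
		 * A real implementation would leave an entry on the list
		 * and set rearm = true when it cannot be freed yet - the
		 * IRR-pending case in the patch above.
		 */
		hlist_del(&item->node);
		kfree(item);
	}

	/*
	 * Rearming must happen under demo_lock so the lockless
	 * timer_pending() check in demo_schedule() cannot race with it.
	 */
	if (rearm)
		mod_timer(tmr, jiffies + 1);

	raw_spin_unlock_irq(&demo_lock);
}

static void demo_schedule(struct demo_item *item, unsigned int cpu)
{
	struct demo_cleanup *cl = per_cpu_ptr(&demo_cleanup, cpu);
	unsigned long flags;

	raw_spin_lock_irqsave(&demo_lock, flags);

	hlist_add_head(&item->node, &cl->head);

	/*
	 * Lockless check: if the timer is already pending, the callback
	 * is guaranteed to see the new entry because it takes demo_lock
	 * before walking the list. Only arm the timer when it is idle,
	 * and always on the target CPU.
	 */
	if (!timer_pending(&cl->timer)) {
		cl->timer.expires = jiffies + 1;
		add_timer_on(&cl->timer, cpu);
	}

	raw_spin_unlock_irqrestore(&demo_lock, flags);
}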