On Mon, Jun 05 2023 at 10:09, H. Peter Anvin wrote: > On 6/5/23 10:07, Thomas Gleixner wrote: >> There is zero reason for this to be an IPI. This can be delegated to a >> worker or whatever delayed mechanism. >> > As we discussed offline, I agree that this is a better solution (and > should be a separate changeset before the FRED one.) The untested below should do the trick. Wants to be split in several patches, but you get the idea. Thanks, tglx --- Subject: x86/vector: Get rid of IRQ_MOVE_CLEANUP_VECTOR From: Thomas Gleixner <tglx@xxxxxxxxxxxxx> No point to waste a vector for cleaning up the leftovers of a moved interrupt. Aside of that this must be the lowest priority of all vectors which makes FRED systems utilizing vectors 0x10-0x1f more complicated than necessary. Schedule a timer instead. Not-Yet-Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx> --- arch/x86/include/asm/hw_irq.h | 4 - arch/x86/include/asm/idtentry.h | 1 arch/x86/include/asm/irq_vectors.h | 7 --- arch/x86/kernel/apic/vector.c | 83 ++++++++++++++++++++++++++---------- arch/x86/kernel/idt.c | 1 arch/x86/platform/uv/uv_irq.c | 2 drivers/iommu/amd/iommu.c | 2 drivers/iommu/hyperv-iommu.c | 4 - drivers/iommu/intel/irq_remapping.c | 2 9 files changed, 68 insertions(+), 38 deletions(-) --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -97,10 +97,10 @@ extern struct irq_cfg *irqd_cfg(struct i extern void lock_vector_lock(void); extern void unlock_vector_lock(void); #ifdef CONFIG_SMP -extern void send_cleanup_vector(struct irq_cfg *); +extern void vector_schedule_cleanup(struct irq_cfg *); extern void irq_complete_move(struct irq_cfg *cfg); #else -static inline void send_cleanup_vector(struct irq_cfg *c) { } +static inline void vector_schedule_cleanup(struct irq_cfg *c) { } static inline void irq_complete_move(struct irq_cfg *c) { } #endif --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -648,7 +648,6 @@ DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI #ifdef CONFIG_SMP DECLARE_IDTENTRY(RESCHEDULE_VECTOR, sysvec_reschedule_ipi); -DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR, sysvec_irq_move_cleanup); DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -35,13 +35,6 @@ */ #define FIRST_EXTERNAL_VECTOR 0x20 -/* - * Reserve the lowest usable vector (and hence lowest priority) 0x20 for - * triggering cleanup after irq migration. 0x21-0x2f will still be used - * for device interrupts. - */ -#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR - #define IA32_SYSCALL_VECTOR 0x80 /* --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -44,7 +44,18 @@ static cpumask_var_t vector_searchmask; static struct irq_chip lapic_controller; static struct irq_matrix *vector_matrix; #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct hlist_head, cleanup_list); + +static void vector_cleanup_callback(struct timer_list *tmr); + +struct vector_cleanup { + struct hlist_head head; + struct timer_list timer; +}; + +static DEFINE_PER_CPU(struct vector_cleanup, vector_cleanup) = { + .head = HLIST_HEAD_INIT, + .timer = __TIMER_INITIALIZER(vector_cleanup_callback, TIMER_PINNED), +}; #endif void lock_vector_lock(void) @@ -843,8 +854,12 @@ void lapic_online(void) void lapic_offline(void) { + struct vector_cleanup *cl = this_cpu_ptr(&vector_cleanup); + lock_vector_lock(); irq_matrix_offline(vector_matrix); + WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0); + WARN_ON_ONCE(!hlist_empty(&cl->head)); unlock_vector_lock(); } @@ -934,62 +949,86 @@ static void free_moved_vector(struct api apicd->move_in_progress = 0; } -DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup) +static void vector_cleanup_callback(struct timer_list *tmr) { - struct hlist_head *clhead = this_cpu_ptr(&cleanup_list); + struct vector_cleanup *cl = container_of(tmr, typeof(*cl), timer); struct apic_chip_data *apicd; struct hlist_node *tmp; + bool rearm = false; - ack_APIC_irq(); /* Prevent vectors vanishing under us */ - raw_spin_lock(&vector_lock); + raw_spin_lock_irq(&vector_lock); - hlist_for_each_entry_safe(apicd, tmp, clhead, clist) { + hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) { unsigned int irr, vector = apicd->prev_vector; /* * Paranoia: Check if the vector that needs to be cleaned - * up is registered at the APICs IRR. If so, then this is - * not the best time to clean it up. Clean it up in the - * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR - * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest - * priority external vector, so on return from this - * interrupt the device interrupt will happen first. + * up is registered at the APICs IRR. That's clearly a + * hardware issue if the vector arrived on the old target + * _after_ interrupts were disabled above. Keep @apicd + * on the list and schedule the timer again to give the CPU + * a chance to handle the pending interrupt. */ irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); if (irr & (1U << (vector % 32))) { - apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq); + rearm = true; continue; } free_moved_vector(apicd); } - raw_spin_unlock(&vector_lock); + /* + * Must happen under vector_lock to make the timer_pending() check + * in __vector_schedule_cleanup() race free against the rearm here. + */ + if (rearm) + mod_timer(tmr, jiffies + 1); + + raw_spin_unlock_irq(&vector_lock); } -static void __send_cleanup_vector(struct apic_chip_data *apicd) +static void __vector_schedule_cleanup(struct apic_chip_data *apicd) { - unsigned int cpu; + unsigned int cpu = apicd->prev_cpu; raw_spin_lock(&vector_lock); apicd->move_in_progress = 0; - cpu = apicd->prev_cpu; if (cpu_online(cpu)) { - hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu)); - apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR); + struct vector_cleanup *cl = per_cpu_ptr(&vector_cleanup, cpu); + + /* + * The lockless timer_pending() check is safe here. If it + * returns true, then the callback will observe this new + * apic data in the hlist as everything is serialized by + * vector lock. + * + * If it returns false then the timer is either not armed + * or the other CPU executes the callback, which again + * would be blocked on vector lock. Rearming it in the + * latter case makes it fire for nothing. + * + * This is also safe against the callback rearming the timer + * because that's serialized via vector lock too. + */ + if (!timer_pending(&cl->timer)) { + cl->timer.expires = jiffies + 1; + add_timer_on(&cl->timer, cpu); + } } else { apicd->prev_vector = 0; } raw_spin_unlock(&vector_lock); } -void send_cleanup_vector(struct irq_cfg *cfg) +void vector_schedule_cleanup(struct irq_cfg *cfg) { struct apic_chip_data *apicd; apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); if (apicd->move_in_progress) - __send_cleanup_vector(apicd); + __vector_schedule_cleanup(apicd); } void irq_complete_move(struct irq_cfg *cfg) @@ -1007,7 +1046,7 @@ void irq_complete_move(struct irq_cfg *c * on the same CPU. */ if (apicd->cpu == smp_processor_id()) - __send_cleanup_vector(apicd); + __vector_schedule_cleanup(apicd); } /* --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -131,7 +131,6 @@ static const __initconst struct idt_data INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi), INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function), INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single), - INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup), INTG(REBOOT_VECTOR, asm_sysvec_reboot), #endif --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -58,7 +58,7 @@ uv_set_irq_affinity(struct irq_data *dat ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret >= 0) { uv_program_mmr(cfg, data->chip_data); - send_cleanup_vector(cfg); + vector_schedule_cleanup(cfg); } return ret; --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3639,7 +3639,7 @@ static int amd_ir_set_affinity(struct ir * at the new destination. So, time to cleanup the previous * vector allocation. */ - send_cleanup_vector(cfg); + vector_schedule_cleanup(cfg); return IRQ_SET_MASK_OK_DONE; } --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -51,7 +51,7 @@ static int hyperv_ir_set_affinity(struct if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) return ret; - send_cleanup_vector(cfg); + vector_schedule_cleanup(cfg); return 0; } @@ -257,7 +257,7 @@ static int hyperv_root_ir_set_affinity(s if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) return ret; - send_cleanup_vector(cfg); + vector_schedule_cleanup(cfg); return 0; } --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1180,7 +1180,7 @@ intel_ir_set_affinity(struct irq_data *d * at the new destination. So, time to cleanup the previous * vector allocation. */ - send_cleanup_vector(cfg); + vector_schedule_cleanup(cfg); return IRQ_SET_MASK_OK_DONE; }