When guest changes its interrupt configuration (such as, vector, etc.) for direct-assigned devices, we need to update the associated IRTE with the new guest vector, so external interrupts from the assigned devices can be injected to guests without VM-Exit. The current method of handling guest lowest priority interrtups is to use a counter 'apic_arb_prio' for each VCPU, we choose the VCPU with smallest 'apic_arb_prio' and then increase it by 1. However, for VT-d PI, we cannot re-use this, since we no longer have control to 'apic_arb_prio' with posted interrupt direct delivery by Hardware. Here, we introduce a similiar way with 'apic_arb_prio' to handle guest lowest priority interrtups when VT-d PI is used. Here is the ideas: - Each VCPU has a counter 'round_robin_counter'. - When guests sets an interrupts to lowest priority, we choose the VCPU with smallest 'round_robin_counter' as the destination, then increase it. Signed-off-by: Feng Wu <feng.wu@xxxxxxxxx> --- arch/x86/include/asm/irq_remapping.h | 6 ++ arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/vmx.c | 12 +++ arch/x86/kvm/x86.c | 11 +++ drivers/iommu/amd_iommu.c | 6 ++ drivers/iommu/intel_irq_remapping.c | 28 +++++++ drivers/iommu/irq_remapping.c | 9 ++ drivers/iommu/irq_remapping.h | 3 + include/linux/dmar.h | 26 ++++++ include/linux/kvm_host.h | 22 +++++ include/uapi/linux/kvm.h | 1 + virt/kvm/assigned-dev.c | 141 ++++++++++++++++++++++++++++++++++ virt/kvm/irq_comm.c | 4 +- virt/kvm/irqchip.c | 11 --- 14 files changed, 269 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index a3cc437..32d6cc4 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -51,6 +51,7 @@ extern void compose_remapped_msi_msg(struct pci_dev *pdev, unsigned int irq, unsigned int dest, struct msi_msg *msg, u8 hpet_id); extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); +extern int update_pi_irte(unsigned int irq, u64 pi_desc_addr, u32 vector); extern void panic_if_irq_remap(const char *msg); extern bool setup_remapped_irq(int irq, struct irq_cfg *cfg, @@ -88,6 +89,11 @@ static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) return -ENODEV; } +static inline int update_pi_irte(unsigned int irq, u64 pi_desc_addr, u32 vector) +{ + return -ENODEV; +} + static inline void panic_if_irq_remap(const char *msg) { } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6ed0c30..0630161 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -358,6 +358,7 @@ struct kvm_vcpu_arch { struct kvm_lapic *apic; /* kernel irqchip context */ unsigned long apic_attention; int32_t apic_arb_prio; + int32_t round_robin_counter; int mp_state; u64 ia32_misc_enable_msr; bool tpr_access_reporting; @@ -771,6 +772,7 @@ struct kvm_x86_ops { int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); + u64 (*get_pi_desc_addr)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a4670d3..ae91b72 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -544,6 +544,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } +struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [number] = VMCS12_OFFSET(name) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ @@ -4280,6 +4285,11 @@ static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu) return; } +static u64 vmx_get_pi_desc_addr(struct kvm_vcpu *vcpu) +{ + return __pa((u64)vcpu_to_pi_desc(vcpu)); +} + /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -9232,6 +9242,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_nested_events = vmx_check_nested_events, .sched_in = vmx_sched_in, + + .get_pi_desc_addr = vmx_get_pi_desc_addr, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b447a98..0c19d15 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7735,6 +7735,17 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +int kvm_update_pi_irte_common(struct kvm *kvm, struct kvm_vcpu *vcpu, + u32 guest_vector, int host_irq) +{ + u64 pi_desc_addr = kvm_x86_ops->get_pi_desc_addr(vcpu); + + if (update_pi_irte(host_irq, pi_desc_addr, guest_vector)) + return -1; + + return 0; +} + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 505a9ad..a36fdc7 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -4280,6 +4280,11 @@ static int alloc_hpet_msi(unsigned int irq, unsigned int id) return 0; } +static int dummy_update_pi_irte(int irq, u64 pi_desc_addr, u32 vector) +{ + return -EINVAL; +} + struct irq_remap_ops amd_iommu_irq_ops = { .supported = amd_iommu_supported, .prepare = amd_iommu_prepare, @@ -4294,5 +4299,6 @@ struct irq_remap_ops amd_iommu_irq_ops = { .msi_alloc_irq = msi_alloc_irq, .msi_setup_irq = msi_setup_irq, .alloc_hpet_msi = alloc_hpet_msi, + .update_pi_irte = dummy_update_pi_irte, }; #endif diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 776da10..87c02fe 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -1172,6 +1172,33 @@ static int intel_alloc_hpet_msi(unsigned int irq, unsigned int id) return ret; } +static int intel_update_pi_irte(int irq, u64 pi_desc_addr, u32 vector) +{ + struct irte irte; + + if (get_irte(irq, &irte)) + return -1; + + irte.irq_post_low.urg = 0; + irte.irq_post_low.vector = vector; + irte.irq_post_low.pda_l = (pi_desc_addr >> (32 - PDA_LOW_BIT)) & + ~(-1UL << PDA_LOW_BIT); + irte.irq_post_high.pda_h = (pi_desc_addr >> 32) & + ~(-1UL << PDA_HIGH_BIT); + + irte.irq_post_low.__reserved_1 = 0; + irte.irq_post_low.__reserved_2 = 0; + irte.irq_post_low.__reserved_3 = 0; + irte.irq_post_high.__reserved_4 = 0; + + irte.irq_post_low.pst = 1; + + if (modify_irte(irq, &irte)) + return -1; + + return 0; +} + struct irq_remap_ops intel_irq_remap_ops = { .supported = intel_irq_remapping_supported, .prepare = dmar_table_init, @@ -1186,4 +1213,5 @@ struct irq_remap_ops intel_irq_remap_ops = { .msi_alloc_irq = intel_msi_alloc_irq, .msi_setup_irq = intel_msi_setup_irq, .alloc_hpet_msi = intel_alloc_hpet_msi, + .update_pi_irte = intel_update_pi_irte, }; diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 2f8ee00..0e36860 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -362,6 +362,15 @@ int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) return default_setup_hpet_msi(irq, id); } +int update_pi_irte(unsigned int irq, u64 pi_desc_addr, u32 vector) +{ + if (!remap_ops || !remap_ops->update_pi_irte) + return -ENODEV; + + return remap_ops->update_pi_irte(irq, pi_desc_addr, vector); +} +EXPORT_SYMBOL_GPL(update_pi_irte); + void panic_if_irq_remap(const char *msg) { if (irq_remapping_enabled) diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h index 7bb5913..2d8f740 100644 --- a/drivers/iommu/irq_remapping.h +++ b/drivers/iommu/irq_remapping.h @@ -84,6 +84,9 @@ struct irq_remap_ops { /* Setup interrupt remapping for an HPET MSI */ int (*alloc_hpet_msi)(unsigned int, unsigned int); + + /* Update IRTE for posted-interrupt */ + int (*update_pi_irte)(int irq, u64 pi_desc_addr, u32 vector); }; extern struct irq_remap_ops intel_irq_remap_ops; diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 8be5d42..e1ff4f7 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -160,6 +160,20 @@ struct irte { __reserved_2 : 8, dest_id : 32; } irq_remap_low; + + struct { + __u64 present : 1, + fpd : 1, + __reserved_1 : 6, + avail : 4, + __reserved_2 : 2, + urg : 1, + pst : 1, + vector : 8, + __reserved_3 : 14, + pda_l : 26; + } irq_post_low; + __u64 low; }; @@ -170,10 +184,22 @@ struct irte { svt : 2, __reserved_3 : 44; } irq_remap_high; + + struct { + __u64 sid: 16, + sq: 2, + svt: 2, + __reserved_4: 12, + pda_h: 32; + } irq_post_high; + __u64 high; }; }; +#define PDA_LOW_BIT 26 +#define PDA_HIGH_BIT 32 + enum { IRQ_REMAP_XAPIC_MODE, IRQ_REMAP_X2APIC_MODE, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ea53b04..6bb8287 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -335,6 +335,25 @@ struct kvm_kernel_irq_routing_entry { struct hlist_node link; }; +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING + +struct kvm_irq_routing_table { + int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; + struct kvm_kernel_irq_routing_entry *rt_entries; + u32 nr_rt_entries; + /* + * Array indexed by gsi. Each entry contains list of irq chips + * the gsi is connected to. + */ + struct hlist_head map[0]; +}; + +#else + +struct kvm_irq_routing_table {}; + +#endif + #ifndef KVM_PRIVATE_MEM_SLOTS #define KVM_PRIVATE_MEM_SLOTS 0 #endif @@ -766,6 +785,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq); +bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq); #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7593c52..509223a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1027,6 +1027,7 @@ struct kvm_s390_ucas_mapping { #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) +#define KVM_ASSIGN_DEV_PI_UPDATE _IOR(KVMIO, 0x7d, __u32) /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index e05000e..e154009 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -326,6 +326,135 @@ void kvm_free_all_assigned_devices(struct kvm *kvm) } } +int __weak kvm_update_pi_irte_common(struct kvm *kvm, struct kvm_vcpu *vcpu, + u32 guest_vector, int host_irq) +{ + return 0; +} + +int kvm_compare_rr_counter(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) +{ + return vcpu1->arch.round_robin_counter - + vcpu2->arch.round_robin_counter; +} + +bool kvm_pi_find_dest_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + int i, r = 0; + struct kvm_vcpu *vcpu, *dest = NULL; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (!kvm_is_dm_lowest_prio(irq)) { + r++; + *dest_vcpu = vcpu; + } else if (kvm_lapic_enabled(vcpu)) { + if (!dest) + dest = vcpu; + else if (kvm_compare_rr_counter(vcpu, dest) < 0) + dest = vcpu; + } + } + + if (dest) { + dest->arch.round_robin_counter++; + *dest_vcpu = dest; + return true; + } else if (r == 1) + return true; + + return false; +} + +static int __kvm_update_pi_irte(struct kvm *kvm, int host_irq, int guest_irq) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + int idx, ret = -EINVAL; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + ASSERT(guest_irq < irq_rt->nr_rt_entries); + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d posted-interrupt has the following + * limitations: + * - No support for posting multicast/broadcast + * interrupts to a VCPU + * Still use interrupt remapping for these + * kind of interrupts + */ + + kvm_set_msi_irq(e, &irq); + if (!kvm_pi_find_dest_vcpu(kvm, &irq, &vcpu)) { + printk(KERN_INFO "%s: can not find the target VCPU\n", + __func__); + ret = -EINVAL; + goto out; + } + + if (kvm_update_pi_irte_common(kvm, vcpu, irq.vector, + host_irq)) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + ret = -EINVAL; + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + +int kvm_update_pi_irte(struct kvm *kvm, u32 dev_id) +{ + int i, rc = -1; + struct kvm_assigned_dev_kernel *dev; + + mutex_lock(&kvm->lock); + dev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, dev_id); + if (!dev) { + printk(KERN_INFO "%s: cannot find the assigned dev.\n", + __func__); + rc = -1; + goto out; + } + + BUG_ON(dev->irq_requested_type == 0); + + if ((dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) && + (dev->dev->msi_enabled == 1)) { + __kvm_update_pi_irte(kvm, + dev->host_irq, dev->guest_irq); + } else if ((dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) && + (dev->dev->msix_enabled == 1)) { + for (i = 0; i < dev->entries_nr; i++) { + __kvm_update_pi_irte(kvm, + dev->host_msix_entries[i].vector, + dev->guest_msix_entries[i].vector); + } + } + +out: + rc = 0; + mutex_unlock(&kvm->lock); + return rc; +} + static int assigned_device_enable_host_intx(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -1017,6 +1146,18 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); break; } + case KVM_ASSIGN_DEV_PI_UPDATE: { + u32 dev_id; + + r = -EFAULT; + if (copy_from_user(&dev_id, argp, sizeof(dev_id))) + goto out; + r = kvm_update_pi_irte(kvm, dev_id); + if (r) + goto out; + break; + + } default: r = -ENOTTY; break; diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 963b899..f51aed3 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -55,7 +55,7 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, line_status); } -inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) +bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) { #ifdef CONFIG_IA64 return irq->delivery_mode == @@ -106,7 +106,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) { trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 7f256f3..cdf29a6 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -31,17 +31,6 @@ #include <trace/events/kvm.h> #include "irq.h" -struct kvm_irq_routing_table { - int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; - struct kvm_kernel_irq_routing_entry *rt_entries; - u32 nr_rt_entries; - /* - * Array indexed by gsi. Each entry contains list of irq chips - * the gsi is connected to. - */ - struct hlist_head map[0]; -}; - int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi) { -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html