From: Jeffrey Hugo <quic_jhugo@xxxxxxxxxxx> commit a2bad844a67b1c7740bda63e87453baf63c3a7f7 upstream. According to Dexuan, the hypervisor folks beleive that multi-msi allocations are not correct. compose_msi_msg() will allocate multi-msi one by one. However, multi-msi is a block of related MSIs, with alignment requirements. In order for the hypervisor to allocate properly aligned and consecutive entries in the IOMMU Interrupt Remapping Table, there should be a single mapping request that requests all of the multi-msi vectors in one shot. Dexuan suggests detecting the multi-msi case and composing a single request related to the first MSI. Then for the other MSIs in the same block, use the cached information. This appears to be viable, so do it. 4.14 backport - file moved to host/pci-hyperv.c. add hv_msi_get_int_vector helper function. Fixed merge conflict due to delivery_mode name change (APIC_DELIVERY_MODE_FIXED is the value given to dest_Fixed). Removed unused variable in hv_compose_msi_msg. Fixed reference to msi_desc->pci to point to the same is_msix variable. Removed changes to compose_msi_req_v3 since it doesn't exist yet. Added "reason" to put_pcichild (unused in function). Suggested-by: Dexuan Cui <decui@xxxxxxxxxxxxx> Signed-off-by: Jeffrey Hugo <quic_jhugo@xxxxxxxxxxx> Reviewed-by: Dexuan Cui <decui@xxxxxxxxxxxxx> Tested-by: Michael Kelley <mikelley@xxxxxxxxxxxxx> Link: https://lore.kernel.org/r/1652282599-21643-1-git-send-email-quic_jhugo@xxxxxxxxxxx Signed-off-by: Wei Liu <wei.liu@xxxxxxxxxx> Signed-off-by: Carl Vanderlip <quic_carlv@xxxxxxxxxxx> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> --- drivers/pci/host/pci-hyperv.c | 62 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 8 deletions(-) --- a/drivers/pci/host/pci-hyperv.c +++ b/drivers/pci/host/pci-hyperv.c @@ -846,6 +846,11 @@ static void hv_int_desc_free(struct hv_p u8 buffer[sizeof(struct pci_delete_interrupt)]; } ctxt; + if (!int_desc->vector_count) { + kfree(int_desc); + return; + } + memset(&ctxt, 0, sizeof(ctxt)); int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; int_pkt->message_type.type = @@ -908,6 +913,13 @@ static void hv_irq_mask(struct irq_data pci_msi_mask_irq(data); } +static unsigned int hv_msi_get_int_vector(struct irq_data *data) +{ + struct irq_cfg *cfg = irqd_cfg(data); + + return cfg->vector; +} + static int hv_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *info) { @@ -1050,12 +1062,12 @@ static void hv_pci_compose_compl(void *c static u32 hv_compose_msi_req_v1( struct pci_create_interrupt *int_pkt, struct cpumask *affinity, - u32 slot, u8 vector) + u32 slot, u8 vector, u8 vector_count) { int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; - int_pkt->int_desc.vector_count = 1; + int_pkt->int_desc.vector_count = vector_count; int_pkt->int_desc.delivery_mode = (apic->irq_delivery_mode == dest_LowestPrio) ? dest_LowestPrio : dest_Fixed; @@ -1071,14 +1083,14 @@ static u32 hv_compose_msi_req_v1( static u32 hv_compose_msi_req_v2( struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity, - u32 slot, u8 vector) + u32 slot, u8 vector, u8 vector_count) { int cpu; int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2; int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; - int_pkt->int_desc.vector_count = 1; + int_pkt->int_desc.vector_count = vector_count; int_pkt->int_desc.delivery_mode = (apic->irq_delivery_mode == dest_LowestPrio) ? dest_LowestPrio : dest_Fixed; @@ -1108,7 +1120,6 @@ static u32 hv_compose_msi_req_v2( */ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - struct irq_cfg *cfg = irqd_cfg(data); struct hv_pcibus_device *hbus; struct hv_pci_dev *hpdev; struct pci_bus *pbus; @@ -1117,6 +1128,8 @@ static void hv_compose_msi_msg(struct ir unsigned long flags; struct compose_comp_ctxt comp; struct tran_int_desc *int_desc; + struct msi_desc *msi_desc; + u8 vector, vector_count; struct { struct pci_packet pci_pkt; union { @@ -1137,7 +1150,8 @@ static void hv_compose_msi_msg(struct ir return; } - pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); + msi_desc = irq_data_get_msi_desc(data); + pdev = msi_desc_to_pci_dev(msi_desc); dest = irq_data_get_effective_affinity_mask(data); pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); @@ -1149,6 +1163,36 @@ static void hv_compose_msi_msg(struct ir if (!int_desc) goto drop_reference; + if (!msi_desc->msi_attrib.is_msix && msi_desc->nvec_used > 1) { + /* + * If this is not the first MSI of Multi MSI, we already have + * a mapping. Can exit early. + */ + if (msi_desc->irq != data->irq) { + data->chip_data = int_desc; + int_desc->address = msi_desc->msg.address_lo | + (u64)msi_desc->msg.address_hi << 32; + int_desc->data = msi_desc->msg.data + + (data->irq - msi_desc->irq); + msg->address_hi = msi_desc->msg.address_hi; + msg->address_lo = msi_desc->msg.address_lo; + msg->data = int_desc->data; + put_pcichild(hpdev, hv_pcidev_ref_by_slot); + return; + } + /* + * The vector we select here is a dummy value. The correct + * value gets sent to the hypervisor in unmask(). This needs + * to be aligned with the count, and also not zero. Multi-msi + * is powers of 2 up to 32, so 32 will always work here. + */ + vector = 32; + vector_count = msi_desc->nvec_used; + } else { + vector = hv_msi_get_int_vector(data); + vector_count = 1; + } + memset(&ctxt, 0, sizeof(ctxt)); init_completion(&comp.comp_pkt.host_event); ctxt.pci_pkt.completion_func = hv_pci_compose_compl; @@ -1159,14 +1203,16 @@ static void hv_compose_msi_msg(struct ir size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1, dest, hpdev->desc.win_slot.slot, - cfg->vector); + vector, + vector_count); break; case PCI_PROTOCOL_VERSION_1_2: size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2, dest, hpdev->desc.win_slot.slot, - cfg->vector); + vector, + vector_count); break; default: