From: Wei Liu <wei.liu@xxxxxxxxxx> Sent: Wednesday, February 3, 2021 7:05 AM > > When Linux runs as the root partition on Microsoft Hypervisor, its > interrupts are remapped. Linux will need to explicitly map and unmap > interrupts for hardware. > > Implement an MSI domain to issue the correct hypercalls. And initialize > this irqdomain as the default MSI irq domain. > > Signed-off-by: Sunil Muthuswamy <sunilmut@xxxxxxxxxxxxx> > Co-Developed-by: Sunil Muthuswamy <sunilmut@xxxxxxxxxxxxx> > Signed-off-by: Wei Liu <wei.liu@xxxxxxxxxx> > --- > v6: > 1. Use u64 status. > 2. Use vpset instead of bitmap. > 3. Factor out hv_map_interrupt > 4. Address other misc comments. > > v4: Fix compilation issue when CONFIG_PCI_MSI is not set. > v3: build irqdomain.o for 32bit as well. > v2: This patch is simplified due to upstream changes. > --- > arch/x86/hyperv/Makefile | 2 +- > arch/x86/hyperv/hv_init.c | 9 + > arch/x86/hyperv/irqdomain.c | 362 ++++++++++++++++++++++++++++++++ > arch/x86/include/asm/mshyperv.h | 2 + > 4 files changed, 374 insertions(+), 1 deletion(-) > create mode 100644 arch/x86/hyperv/irqdomain.c > > diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile > index 565358020921..48e2c51464e8 100644 > --- a/arch/x86/hyperv/Makefile > +++ b/arch/x86/hyperv/Makefile > @@ -1,5 +1,5 @@ > # SPDX-License-Identifier: GPL-2.0-only > -obj-y := hv_init.o mmu.o nested.o > +obj-y := hv_init.o mmu.o nested.o irqdomain.o > obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o > > ifdef CONFIG_X86_64 > diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c > index 11c5997691f4..894ce899f0cb 100644 > --- a/arch/x86/hyperv/hv_init.c > +++ b/arch/x86/hyperv/hv_init.c > @@ -483,6 +483,15 @@ void __init hyperv_init(void) > > BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull); > > +#ifdef CONFIG_PCI_MSI > + /* > + * If we're running as root, we want to create our own PCI MSI domain. > + * We can't set this in hv_pci_init because that would be too late. > + */ > + if (hv_root_partition) > + x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain; > +#endif > + > return; > > remove_cpuhp_state: > diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c > new file mode 100644 > index 000000000000..117f17e8c88a > --- /dev/null > +++ b/arch/x86/hyperv/irqdomain.c > @@ -0,0 +1,362 @@ > +// SPDX-License-Identifier: GPL-2.0 > + > +/* > + * for Linux to run as the root partition on Microsoft Hypervisor. Nit: Looks like the initial word "Irqdomain" got dropped from the above comment line. But don't respin just for this. > + * > + * Authors: > + * Sunil Muthuswamy <sunilmut@xxxxxxxxxxxxx> > + * Wei Liu <wei.liu@xxxxxxxxxx> > + */ > + > +#include <linux/pci.h> > +#include <linux/irq.h> > +#include <asm/mshyperv.h> > + > +static int hv_map_interrupt(union hv_device_id device_id, bool level, > + int cpu, int vector, struct hv_interrupt_entry *entry) > +{ > + struct hv_input_map_device_interrupt *input; > + struct hv_output_map_device_interrupt *output; > + struct hv_device_interrupt_descriptor *intr_desc; > + unsigned long flags; > + u64 status; > + cpumask_t mask = CPU_MASK_NONE; > + int nr_bank, var_size; > + > + local_irq_save(flags); > + > + input = *this_cpu_ptr(hyperv_pcpu_input_arg); > + output = *this_cpu_ptr(hyperv_pcpu_output_arg); > + > + intr_desc = &input->interrupt_descriptor; > + memset(input, 0, sizeof(*input)); > + input->partition_id = hv_current_partition_id; > + input->device_id = device_id.as_uint64; > + intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED; > + intr_desc->vector_count = 1; > + intr_desc->target.vector = vector; > + > + if (level) > + intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_LEVEL; > + else > + intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE; > + > + cpumask_set_cpu(cpu, &mask); > + intr_desc->target.vp_set.valid_bank_mask = 0; > + intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K; > + nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), &mask); There's a function get_cpu_mask() that returns a pointer to a cpumask with only the specified cpu set in the mask. It returns a const pointer to the correct entry in a pre-allocated array of all such cpumasks, so it's a lot more efficient than allocating and initializing a local cpumask instance on the stack. > + if (nr_bank < 0) { > + local_irq_restore(flags); > + pr_err("%s: unable to generate VP set\n", __func__); > + return EINVAL; > + } > + intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; > + > + /* > + * var-sized hypercall, var-size starts after vp_mask (thus > + * vp_set.format does not count, but vp_set.valid_bank_mask > + * does). > + */ > + var_size = nr_bank + 1; > + > + status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size, > + input, output); > + *entry = output->interrupt_entry; > + > + local_irq_restore(flags); > + > + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) > + pr_err("%s: hypercall failed, status %lld\n", __func__, status); > + > + return status & HV_HYPERCALL_RESULT_MASK; > +} > + > +static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry) > +{ > + unsigned long flags; > + struct hv_input_unmap_device_interrupt *input; > + struct hv_interrupt_entry *intr_entry; > + u64 status; > + > + local_irq_save(flags); > + input = *this_cpu_ptr(hyperv_pcpu_input_arg); > + > + memset(input, 0, sizeof(*input)); > + intr_entry = &input->interrupt_entry; > + input->partition_id = hv_current_partition_id; > + input->device_id = id; > + *intr_entry = *old_entry; > + > + status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL); > + local_irq_restore(flags); > + > + return status & HV_HYPERCALL_RESULT_MASK; > +} > + > +#ifdef CONFIG_PCI_MSI > +struct rid_data { > + struct pci_dev *bridge; > + u32 rid; > +}; > + > +static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data) > +{ > + struct rid_data *rd = data; > + u8 bus = PCI_BUS_NUM(rd->rid); > + > + if (pdev->bus->number != bus || PCI_BUS_NUM(alias) != bus) { > + rd->bridge = pdev; > + rd->rid = alias; > + } > + > + return 0; > +} > + > +static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev) > +{ > + union hv_device_id dev_id; > + struct rid_data data = { > + .bridge = NULL, > + .rid = PCI_DEVID(dev->bus->number, dev->devfn) > + }; > + > + pci_for_each_dma_alias(dev, get_rid_cb, &data); > + > + dev_id.as_uint64 = 0; > + dev_id.device_type = HV_DEVICE_TYPE_PCI; > + dev_id.pci.segment = pci_domain_nr(dev->bus); > + > + dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid); > + dev_id.pci.bdf.device = PCI_SLOT(data.rid); > + dev_id.pci.bdf.function = PCI_FUNC(data.rid); > + dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE; > + > + if (data.bridge) { > + int pos; > + > + /* > + * Microsoft Hypervisor requires a bus range when the bridge is > + * running in PCI-X mode. > + * > + * To distinguish conventional vs PCI-X bridge, we can check > + * the bridge's PCI-X Secondary Status Register, Secondary Bus > + * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge > + * Specification Revision 1.0 5.2.2.1.3. > + * > + * Value zero means it is in conventional mode, otherwise it is > + * in PCI-X mode. > + */ > + > + pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX); > + if (pos) { > + u16 status; > + > + pci_read_config_word(data.bridge, pos + > + PCI_X_BRIDGE_SSTATUS, &status); > + > + if (status & PCI_X_SSTATUS_FREQ) { > + /* Non-zero, PCI-X mode */ > + u8 sec_bus, sub_bus; > + > + dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE; > + > + pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus); > + dev_id.pci.shadow_bus_range.secondary_bus = sec_bus; > + pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus); > + dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus; > + } > + } > + } > + > + return dev_id; > +} > + > +static int hv_map_msi_interrupt(struct pci_dev *dev, int cpu, int vector, > + struct hv_interrupt_entry *entry) > +{ > + union hv_device_id device_id = hv_build_pci_dev_id(dev); > + > + return hv_map_interrupt(device_id, false, cpu, vector, entry); > +} > + > +static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg) > +{ > + /* High address is always 0 */ > + msg->address_hi = 0; > + msg->address_lo = entry->msi_entry.address.as_uint32; > + msg->data = entry->msi_entry.data.as_uint32; > +} > + > +static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry); > +static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) > +{ > + struct msi_desc *msidesc; > + struct pci_dev *dev; > + struct hv_interrupt_entry out_entry, *stored_entry; > + struct irq_cfg *cfg = irqd_cfg(data); > + cpumask_t *affinity; > + int cpu; > + u64 status; > + > + msidesc = irq_data_get_msi_desc(data); > + dev = msi_desc_to_pci_dev(msidesc); > + > + if (!cfg) { > + pr_debug("%s: cfg is NULL", __func__); > + return; > + } > + > + affinity = irq_data_get_effective_affinity_mask(data); > + cpu = cpumask_first_and(affinity, cpu_online_mask); > + > + if (data->chip_data) { > + /* > + * This interrupt is already mapped. Let's unmap first. > + * > + * We don't use retarget interrupt hypercalls here because > + * Microsoft Hypervisor doens't allow root to change the vector > + * or specify VPs outside of the set that is initially used > + * during mapping. > + */ > + stored_entry = data->chip_data; > + data->chip_data = NULL; > + > + status = hv_unmap_msi_interrupt(dev, stored_entry); > + > + kfree(stored_entry); > + > + if (status != HV_STATUS_SUCCESS) { > + pr_debug("%s: failed to unmap, status %lld", __func__, status); > + return; > + } > + } > + > + stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC); > + if (!stored_entry) { > + pr_debug("%s: failed to allocate chip data\n", __func__); > + return; > + } > + > + status = hv_map_msi_interrupt(dev, cpu, cfg->vector, &out_entry); > + if (status != HV_STATUS_SUCCESS) { > + kfree(stored_entry); > + return; > + } > + > + *stored_entry = out_entry; > + data->chip_data = stored_entry; > + entry_to_msi_msg(&out_entry, msg); > + > + return; > +} > + > +static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry) > +{ > + return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry); > +} > + > +static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc *msidesc, int irq) > +{ > + u64 status; > + struct hv_interrupt_entry old_entry; > + struct irq_desc *desc; > + struct irq_data *data; > + struct msi_msg msg; > + > + desc = irq_to_desc(irq); > + if (!desc) { > + pr_debug("%s: no irq desc\n", __func__); > + return; > + } > + > + data = &desc->irq_data; > + if (!data) { > + pr_debug("%s: no irq data\n", __func__); > + return; > + } > + > + if (!data->chip_data) { > + pr_debug("%s: no chip data\n!", __func__); > + return; > + } > + > + old_entry = *(struct hv_interrupt_entry *)data->chip_data; > + entry_to_msi_msg(&old_entry, &msg); > + > + kfree(data->chip_data); > + data->chip_data = NULL; > + > + status = hv_unmap_msi_interrupt(dev, &old_entry); > + > + if (status != HV_STATUS_SUCCESS) { > + pr_err("%s: hypercall failed, status %lld\n", __func__, status); > + return; > + } > +} > + > +static void hv_msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) > +{ > + int i; > + struct msi_desc *entry; > + struct pci_dev *pdev; > + > + if (WARN_ON_ONCE(!dev_is_pci(dev))) > + return; > + > + pdev = to_pci_dev(dev); > + > + for_each_pci_msi_entry(entry, pdev) { > + if (entry->irq) { > + for (i = 0; i < entry->nvec_used; i++) { > + hv_teardown_msi_irq_common(pdev, entry, entry->irq + i); > + irq_domain_free_irqs(entry->irq + i, 1); > + } > + } > + } > +} > + > +/* > + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, > + * which implement the MSI or MSI-X Capability Structure. > + */ > +static struct irq_chip hv_pci_msi_controller = { > + .name = "HV-PCI-MSI", > + .irq_unmask = pci_msi_unmask_irq, > + .irq_mask = pci_msi_mask_irq, > + .irq_ack = irq_chip_ack_parent, > + .irq_retrigger = irq_chip_retrigger_hierarchy, > + .irq_compose_msi_msg = hv_irq_compose_msi_msg, > + .irq_set_affinity = msi_domain_set_affinity, > + .flags = IRQCHIP_SKIP_SET_WAKE, > +}; > + > +static struct msi_domain_ops pci_msi_domain_ops = { > + .domain_free_irqs = hv_msi_domain_free_irqs, > + .msi_prepare = pci_msi_prepare, > +}; > + > +static struct msi_domain_info hv_pci_msi_domain_info = { > + .flags = MSI_FLAG_USE_DEF_DOM_OPS | > MSI_FLAG_USE_DEF_CHIP_OPS | > + MSI_FLAG_PCI_MSIX, > + .ops = &pci_msi_domain_ops, > + .chip = &hv_pci_msi_controller, > + .handler = handle_edge_irq, > + .handler_name = "edge", > +}; > + > +struct irq_domain * __init hv_create_pci_msi_domain(void) > +{ > + struct irq_domain *d = NULL; > + struct fwnode_handle *fn; > + > + fn = irq_domain_alloc_named_fwnode("HV-PCI-MSI"); > + if (fn) > + d = pci_msi_create_irq_domain(fn, &hv_pci_msi_domain_info, x86_vector_domain); > + > + /* No point in going further if we can't get an irq domain */ > + BUG_ON(!d); > + > + return d; > +} > + > +#endif /* CONFIG_PCI_MSI */ > diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h > index cbee72550a12..ccc849e25d5e 100644 > --- a/arch/x86/include/asm/mshyperv.h > +++ b/arch/x86/include/asm/mshyperv.h > @@ -261,6 +261,8 @@ static inline void hv_set_msi_entry_from_desc(union hv_msi_entry > *msi_entry, > msi_entry->data.as_uint32 = msi_desc->msg.data; > } > > +struct irq_domain *hv_create_pci_msi_domain(void); > + > #else /* CONFIG_HYPERV */ > static inline void hyperv_init(void) {} > static inline void hyperv_setup_mmu_ops(void) {} > -- > 2.20.1