On Sun, 2 Aug 2015, jakeo@xxxxxxxxxxxxx wrote: > +#include <linux/kernel.h> > +#include <linux/jiffies.h> > +#include <linux/mman.h> > +#include <linux/delay.h> > +#include <linux/init.h> > +#include <linux/module.h> > +#include <linux/slab.h> > +#include <linux/kthread.h> > +#include <linux/completion.h> > +#include <linux/notifier.h> > +#include <linux/of_pci.h> > +#include <linux/of_platform.h> > +#include <linux/pci.h> > +#include <linux/semaphore.h> > +#include <linux/irqdomain.h> > +#include <asm/irqdomain.h> > +#include <linux/msi.h> I seriously doubt, that you need all of these includes. > +/* Interrupt management hooks */ > + > +/** > + * hv_msi_free() - Free the MSI. > + * @domain: The interrupt domain pointer > + * @info: Extra MSI-related context > + * @virq: Identifies the IRQ. > + * > + * The Hyper-V parent partition and hypervisor are tracking the > + * messages that are in use, keeping the interrupt redirection > + * table up to date. This callback sends a message that frees > + * the the IRT entry and related tracking nonsense. > + */ > +void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info, > + unsigned int virq) static? > +{ > + struct pci_delete_interrupt *int_pkt; > + struct { > + struct pci_packet pkt; > + u8 buffer[sizeof(struct pci_delete_interrupt) - > + sizeof(struct pci_message)]; > + } ctxt; > + struct hv_pcibus_device *hbus; > + struct hv_pci_dev *hpdev; > + struct irq_desc *desc; > + struct msi_desc *msi; > + struct tran_int_desc *int_desc; > + struct irq_desc *irq_desc; > + > + desc = irq_to_desc(virq); > + msi = irq_desc_get_msi_desc(desc); Why do you need to lookup the irq descriptor if you want the msi descriptor? > + hbus = info->data; > + hpdev = lookup_hv_dev(hbus, devfn_to_wslot(msi->dev->devfn)); > + if (!hpdev) > + return; > + > + int_desc = irq_get_handler_data(virq); I don't think this is a proper storage point. The data is domain/chip specific, right? So, what's wrong with storing it in irq_data::chip_data? > + if (int_desc) { > + memset(&ctxt, 0, sizeof(ctxt)); > + int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; > + int_pkt->message_type.message_type = > + PCI_DELETE_INTERRUPT_MESSAGE; > + int_pkt->wslot.slot = hpdev->desc.win_slot.slot; > + int_pkt->int_desc = *int_desc; > + vmbus_sendpacket(hbus->hdev->channel, int_pkt, sizeof(*int_pkt), > + (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, > + 0); > + kfree(int_desc); Free before clearing the reference? > + irq_desc = irq_to_desc(virq); Do you expect the descriptor to change between the lookup above and this one? > + irq_desc->irq_data.handler_data = NULL; You are not supposed to fiddle in irq_desc or irq_data internals. We have helper functions for that. > + } > + > + hv_pcichild_dec(hpdev, hv_pcidev_ref_by_slot); > +} > + > +int hv_set_affinity(struct irq_data *data, const struct cpumask *dest, > + bool force) static, if at all. > +{ > + struct irq_data *parent = data->parent_data; > + > + return parent->chip->irq_set_affinity(parent, dest, force); > +} irq_chip_set_affinity_parent ??? > +/** > + * hv_compose_msi_msg() - Supplies a valid MSI address/data > + * @data: Everything about this MSI > + * @msg: Buffer that is filled in by this function > + * > + * This function unpacks the IRQ looking for target CPU set, IDT > + * vector and mode and sends a message to the parent partition > + * asking for a mapping for that tuple in this partition. The > + * response supplies a data value and address to which that data > + * should be written to trigger that interrupt. > + */ > +void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) static ? > +{ > + struct irq_cfg *cfg = irqd_cfg(data); > + struct hv_pcibus_device *hbus; > + struct hv_pci_dev *hpdev; > + struct pci_bus *pbus; > + struct pci_create_interrupt *int_pkt; > + struct compose_comp_ctxt comp; > + struct tran_int_desc *int_desc; > + struct irq_desc *irq_desc; > + struct { > + struct pci_packet pkt; > + u8 buffer[sizeof(struct pci_create_interrupt) - > + sizeof(struct pci_message)]; > + } ctxt; > + int cpu; > + int ret; > + > + pbus = data->msi_desc->dev->bus; > + hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); > + hpdev = lookup_hv_dev(hbus, devfn_to_wslot(data->msi_desc->dev->devfn)); > + > + if (!hpdev) > + goto return_null_message; > + > + int_desc = kzalloc(sizeof(*int_desc), GFP_KERNEL); > + if (!int_desc) > + goto return_null_message; > + > + memset(&ctxt, 0, sizeof(ctxt)); > + init_completion(&comp.comp_pkt.host_event); > + ctxt.pkt.completion_func = hv_pci_compose_compl; > + ctxt.pkt.compl_ctxt = ∁ > + int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message; > + int_pkt->message_type.message_type = PCI_CREATE_INTERRUPT_MESSAGE; > + int_pkt->wslot.slot = hpdev->desc.win_slot.slot; > + int_pkt->int_desc.vector = cfg->vector; > + int_pkt->int_desc.vector_count = 1; > + int_pkt->int_desc.delivery_mode = > + (apic->irq_delivery_mode == dest_LowestPrio) ? 1 : 0; > + > + /* > + * Cut down interrupt targets to a single processor. Hyper-V > + * currently only supports picking the target of an MSI by > + * sending a message to the parent partition and receiving a > + * response. This can't really work in the context of > + * irq_set_affinity, which sometimes needs to run in an environment > + * that can't wait for a completion. Until a future version of > + * Hyper-V exists that can retarget an interrupt without waiting, > + * this code picks a single virtual processor as the target. This > + * comment and the following lines should be deleted when that > + * happens. > + */ > + cpu = cpumask_any_and(data->affinity, cpu_online_mask); We have accessors for this for a reason. data->affinity will be gone after 4.3-rc1. > + cpumask_clear(data->affinity); > + cpumask_set_cpu(cpu, data->affinity); > + /* > + * This bit doesn't have to work on machines with more than 64 > + * processors because Hyper-V only supports 64 in a guest. > + */ > + for_each_cpu(cpu, data->affinity) { > + if (cpu_is_offline(cpu)) > + continue; > + int_pkt->int_desc.cpu_mask |= > + (1 << vmbus_cpu_number_to_vp_number(cpu)); > + } ROTFL. What's the point of this loop? To find the single already known cpu which is in the mask? > + ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, > + sizeof(*int_pkt), (unsigned long)&ctxt.pkt, > + VM_PKT_DATA_INBAND, > + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); > + if (!ret) > + wait_for_completion(&comp.comp_pkt.host_event); > + > + if (comp.comp_pkt.completion_status < 0) { > + pr_err("Request for interrupt failed: 0x%x", > + comp.comp_pkt.completion_status); > + goto return_null_message; > + } > + > + /* > + * Record the assignment so that this can be unwound later. Using > + * irq_set_handler_data() here would be appropriate, but the lock > + * it takes is already held. > + */ > + *int_desc = comp.int_desc; > + irq_desc = irq_to_desc(data->irq); > + irq_desc->irq_data.handler_data = int_desc; You're really a fan of redundant lookups. How is irq_desc->irq_data going to be different from data? > + /* Pass up the result. */ > + msg->address_hi = comp.int_desc.address >> 32; > + msg->address_lo = comp.int_desc.address & 0xffffffff; > + msg->data = comp.int_desc.data; > + > + hv_pcichild_dec(hpdev, hv_pcidev_ref_by_slot); > + return; > + > +return_null_message: Leaks int_desc and a refcount on hpdev. > + msg->address_hi = 0; > + msg->address_lo = 0; > + msg->data = 0; > +} > + > +/* HW Interrupt Chip Descriptor */ > +static struct irq_chip hv_msi_irq_chip = { > + .name = "Hyper-V PCIe MSI", > + .irq_compose_msi_msg = hv_compose_msi_msg, > + .irq_set_affinity = hv_set_affinity, > + .irq_ack = irq_chip_ack_parent, Please format this readable: .name = "Hyper-V PCIe MSI", .irq_compose_msi_msg = hv_compose_msi_msg, .irq_set_affinity = hv_set_affinity, .irq_ack = irq_chip_ack_parent, Hmm? > +}; > + > +static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info, > + msi_alloc_info_t *arg) > +{ > + return arg->msi_hwirq; > +} > + > +static int hv_msi_domain_ops_prepare(struct irq_domain *domain, > + struct device *dev, int nvec, > + msi_alloc_info_t *arg) > +{ > + struct pci_dev *pdev = to_pci_dev(dev); > + struct msi_desc *desc = first_pci_msi_entry(pdev); > + > + memset(arg, 0, sizeof(*arg)); > + arg->msi_dev = pdev; > + if (desc->msi_attrib.is_msix) { > + arg->type = X86_IRQ_ALLOC_TYPE_MSIX; > + } else { > + arg->type = X86_IRQ_ALLOC_TYPE_MSI; > + arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; > + } > + > + return 0; > +} Pretty much a copy of the x86 code, right? I wonder whether this MSI infrastructure code would be better seperated out and moved to arch/x86/hyperv/msi.c or arch/x86/kernel/apic/hvmsi.c. It's small enough to be built in. So all you'd need to export is hv_pcie_init_irq_domain and hv_pcie_free_irq_domain. Thanks, tglx -- To unsubscribe from this list: send the line "unsubscribe linux-pci" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html