On Thu, 2012-05-24 at 18:02 +0100, Richard Weinberger wrote: > MSI interrupt affinity setting on the guest always ended up on vcpu0, > no matter what. > IOW writes to /proc/irq/<IRQ>/smp_affinity are ignored. > This patch fixes the MSI IRQ routing and avoids the utter madness of > tearing down and setting up the interrupt completely when this changes. > > Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx> > Signed-off-by: Richard Weinberger <richard@xxxxxx> > --- > hw/device-assignment.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++-- > 1 files changed, 70 insertions(+), 3 deletions(-) > > diff --git a/hw/device-assignment.c b/hw/device-assignment.c > index 09726f9..78d57c8 100644 > --- a/hw/device-assignment.c > +++ b/hw/device-assignment.c > @@ -913,6 +913,50 @@ void assigned_dev_update_irqs(void) > } > } > > +static void assigned_dev_update_msi_route(PCIDevice *pci_dev) > +{ > + AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev); > + uint8_t ctrl_byte = pci_get_byte(pci_dev->config + pci_dev->msi_cap + > + PCI_MSI_FLAGS); > + struct kvm_irq_routing_entry *old, new; > + KVMMsiMessage msg; > + int r; Please follow qemu coding style for braces throughout. > + > + if (!(ctrl_byte & PCI_MSI_FLAGS_ENABLE)) > + return; > + > + msg.addr_lo = pci_get_long(pci_dev->config + pci_dev->msi_cap + > + PCI_MSI_ADDRESS_LO); > + msg.addr_hi = pci_get_long(pci_dev->config + pci_dev->msi_cap + > + PCI_MSI_ADDRESS_HI); Odd, since we only expose a 32bit MSI capability to the guest... > + msg.data = pci_get_long(pci_dev->config + pci_dev->msi_cap + > + PCI_MSI_DATA_32); Should be pci_get_word() > + > + old = adev->entry; > + new = *old; > + new.u.msi.address_lo = msg.addr_lo; > + new.u.msi.address_hi = msg.addr_hi; > + new.u.msi.data = msg.data; > + > + if (memcmp(old, &new, sizeof(new)) == 0) > + return; > + > + r = kvm_update_routing_entry(old, &new); How does this work? 
old is now new, so kvm_update_routing_entry() is never going to match the existing entry if address_lo or data actually change. > + if (r < 0) { > + fprintf(stderr, "%s: kvm_update_msi failed: %s\n", __func__, > + strerror(-r)); > + exit(1); > + } > + > + *old = new; huh? > + r = kvm_irqchip_commit_routes(kvm_state); > + if (r) { > + fprintf(stderr, "%s: kvm_irqchip_commit_routes failed: %s\n", __func__, > + strerror(-r)); > + exit(1); > + } > +} > + > static void assigned_dev_update_msi(PCIDevice *pci_dev) > { > struct kvm_assigned_irq assigned_irq_data; > @@ -1116,6 +1160,14 @@ static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev, > uint32_t virt_val = pci_default_read_config(pci_dev, address, len); > uint32_t real_val, emulate_mask, full_emulation_mask; > > + if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) { > + uint32_t msi_start = pci_dev->msi_cap; > + uint32_t msi_end = msi_start + PCI_MSI_DATA_64 + 3; > + > + if (address >= msi_start && (address + len) < msi_end) ranges_overlap() is meant for this. We only expose a 32bit MSI cap, so msi_end is wrong. > + return virt_val; > + } > + > emulate_mask = 0; > memcpy(&emulate_mask, assigned_dev->emulate_config_read + address, len); > emulate_mask = le32_to_cpu(emulate_mask); > @@ -1130,6 +1182,17 @@ static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev, > } > } > > +static void handle_cfg_write_msi(PCIDevice *pci_dev, AssignedDevice *adev) > +{ > + if (!kvm_enabled() || !kvm_irqchip_in_kernel()) > + return; Unnecessary, device assignment doesn't work otherwise. > + > + if (adev->entry && (adev->irq_requested_type & KVM_DEV_IRQ_GUEST_MSI)) Should just be able to test irq_requested_type. 
> + assigned_dev_update_msi_route(pci_dev); > + else > + assigned_dev_update_msi(pci_dev); > +} > + > static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address, > uint32_t val, int len) > { > @@ -1155,9 +1218,13 @@ static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address, > } > } > if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) { > - if (range_covers_byte(address, len, > - pci_dev->msi_cap + PCI_MSI_FLAGS)) { > - assigned_dev_update_msi(pci_dev); > + uint32_t msi_start = pci_dev->msi_cap; > + uint32_t msi_end = msi_start + PCI_MSI_DATA_64 + 3; > + > + if (address >= msi_start && (address + len) < msi_end) { Use ranges_overlap() please, msi_end is wrong. > + if (address == msi_start + PCI_MSI_DATA_32) > + handle_cfg_write_msi(pci_dev, assigned_dev); Why didn't we just use range_covers_byte(address, len, pci_dev->msi_cap + PCI_MSI_DATA_32) to start with? But how does this handle the enable bit? > + return; > } > } > if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) { Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html