On Tue, 2012-08-14 at 18:53 +0300, Avi Kivity wrote: > On 08/01/2012 08:18 AM, Alex Williamson wrote: > > This adds the core of the QEMU VFIO-based PCI device assignment driver. > > To make use of this driver, enable CONFIG_VFIO, CONFIG_VFIO_IOMMU_TYPE1, > > and CONFIG_VFIO_PCI in your host Linux kernel config. Load the vfio-pci > > module. To assign device 0000:05:00.0 to a guest, do the following: > > > > for dev in $(ls /sys/bus/pci/devices/0000:05:00.0/iommu_group/devices); do > > vendor=$(cat /sys/bus/pci/devices/$dev/vendor) > > device=$(cat /sys/bus/pci/devices/$dev/device) > > if [ -e /sys/bus/pci/devices/$dev/driver ]; then > > echo $dev > /sys/bus/pci/devices/$dev/driver/unbind > > fi > > echo $vendor $device > /sys/bus/pci/drivers/vfio-pci/new_id > > done > > > > See Documentation/vfio.txt in the Linux kernel tree for further > > description of IOMMU groups and VFIO. > > > > Then launch qemu including the option: > > > > -device vfio-pci,host=0000:05:00.0 > > > > Support for legacy PCI interrupts (INTx) is not yet included and will > > be added in a future update. Both MSI and MSI-X are supported here. > > > > + > > +static void vfio_update_irq(PCIDevice *pdev) > > +{ > > + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); > > + PCIINTxRoute route; > > + > > + if (vdev->interrupt != INT_INTx) { > > + return; > > + } > > + > > + route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin); > > + if (!memcmp(&route, &vdev->intx.route, sizeof(route))) { > > + return; /* Nothing changed */ > > + } > > You can't memcmp() structures, the compiler may add uninitialized holes > that will miscompare. It's probably harmless here since it's an > optimization. Added this helper function: /* TODO: Move this helper out to generic PCI code */ static bool vfio_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new) { return old->mode != new->mode || old->irq != new->irq; } > Unrelated nit: memcmp() doesn't return a boolean or a count, so > !memcmp() is really unintuitive, at least to me. I figure we're all pretty used to it growing up on !strcmp though. > > + > > +static int vfio_enable_intx(VFIODevice *vdev) > > +{ > > + struct vfio_irq_set_fd irq_set_fd = { > > + .irq_set = { > > + .argsz = sizeof(irq_set_fd), > > + .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, > > + .index = VFIO_PCI_INTX_IRQ_INDEX, > > + .start = 0, > > + .count = 1, > > + }, > > + }; > > + uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); > > + > > + if (!pin) { > > + return 0; > > + } > > + > > + vfio_disable_interrupts(vdev); > > + > > + vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */ > > + vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev, > > + vdev->intx.pin); > > + /* TBD - Enable QEMU eoi notifier */ > > + > > + if (event_notifier_init(&vdev->intx.interrupt, 0)) { > > + error_report("vfio: Error: event_notifier_init failed\n"); > > + return -1; > > return -error is better. Here we probably want to return the return value of event_notifier_init(), but there are lots of cases where we could return -errno. Fixed them. > > + } > > + > > + irq_set_fd.fd = event_notifier_get_fd(&vdev->intx.interrupt); > > + qemu_set_fd_handler(irq_set_fd.fd, vfio_intx_interrupt, NULL, vdev); > > + > > + if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd)) { > > + error_report("vfio: Error: Failed to setup INTx fd: %s\n", > > + strerror(errno)); > > + return -1; > > + } > > + > > + vfio_enable_intx_kvm(vdev); > > + > > + vdev->interrupt = INT_INTx; > > + > > + DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain, > > + vdev->host.bus, vdev->host.slot, vdev->host.function); > > + > > + return 0; > > +} > > + > > + > > + > > +/* XXX This should move to msi.c */ > > Well? Just marking a todo item. I'll change it formally to TODO. I think there are a few interfaces to msi.c that probably needs some rethinking for device assignment. When they're small like this it seems easier to have the user in tree first. > > > +static MSIMessage msi_get_msg(PCIDevice *pdev, unsigned int vector) > > +{ > > + uint16_t flags = pci_get_word(pdev->config + pdev->msi_cap + PCI_MSI_FLAGS); > > + bool msi64bit = flags & PCI_MSI_FLAGS_64BIT; > > + MSIMessage msg; > > + > > + if (msi64bit) { > > + msg.address = pci_get_quad(pdev->config + > > + pdev->msi_cap + PCI_MSI_ADDRESS_LO); > > + } else { > > + msg.address = pci_get_long(pdev->config + > > + pdev->msi_cap + PCI_MSI_ADDRESS_LO); > > + } > > + > > + msg.data = pci_get_word(pdev->config + pdev->msi_cap + > > + (msi64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32)); > > + msg.data += vector; > > + > > + return msg; > > +} > > + > > + > > +/* > > + * IO Port/MMIO - Beware of the endians, VFIO is always little endian > > + */ > > +static void vfio_bar_write(void *opaque, target_phys_addr_t addr, > > + uint64_t data, unsigned size) > > +{ > > + VFIOBAR *bar = opaque; > > + uint8_t buf[8]; > > + > > + switch (size) { > > + case 1: > > + *buf = data & 0xff; > > + break; > > + case 2: > > + *(uint16_t *)buf = cpu_to_le16(data); > > + break; > > + case 4: > > + *(uint32_t *)buf = cpu_to_le32(data); > > + break; > > This works accidentally on machines that require alignment, since > there's no requirement from the compiler to align buf. You can use a > union to align it. Good catch, fixed. > > + default: > > + hw_error("vfio: unsupported write size, %d bytes\n", size); > > + break; > > + } > > + > > + if (pwrite(bar->fd, buf, size, bar->fd_offset + addr) != size) { > > + error_report("%s(,0x%"PRIx64", 0x%"PRIx64", %d) failed: %s\n", > > + __func__, addr, data, size, strerror(errno)); > > + } > > + > > + DPRINTF("%s(BAR%d+0x%"PRIx64", 0x%"PRIx64", %d)\n", > > + __func__, bar->nr, addr, data, size); > > +} > > + > > + > > +static void vfio_listener_region_add(MemoryListener *listener, > > + MemoryRegionSection *section) > > +{ > > + VFIOContainer *container = container_of(listener, VFIOContainer, > > + iommu_data.listener); > > + target_phys_addr_t iova, end; > > + void *vaddr; > > + int ret; > > + > > + if (vfio_listener_skipped_section(section)) { > > + DPRINTF("vfio: SKIPPING region_add %016lx - %016lx\n", > > + section->offset_within_address_space, > > + section->offset_within_address_space + section->size - 1); > > + return; > > + } > > + > > + if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != > > + (section->offset_within_region & ~TARGET_PAGE_MASK))) { > > + error_report("%s received unaligned region\n", __func__); > > Is it really an error? I think you can just add the condition to > skipped_section. I had left this in as paranoia for myself that I wanted to see if this actually happens. I want to assume that our TARGET_PAGE_ALIGNED offset_within_address_space results in an aligned ram pointer. If one is aligned different from the other we're kinda screwed trying to map it into the iommu. So far I haven't seen it. Thanks for the feedback, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html