On Tue, Dec 15, 2009 at 07:16:13PM +0100, Alexander Graf wrote: > Michael S. Tsirkin wrote: > > On Fri, Dec 11, 2009 at 12:06:25AM +0100, Alexander Graf wrote: > > > >> While trying to get device passthrough working with an emulex hba, kvm > >> refused to pass it through because it has a BAR of 256 bytes: > >> > >> Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K] > >> Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256] > >> Region 4: I/O ports at b100 [size=256] > >> > >> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of > >> physical to virtual addresses, we can still take the old MMIO callback route. > >> > >> So let's add a second code path that allows for size & 0xFFF != 0 sized regions > >> by looping it through userspace. > >> > >> I verified that it works by passing through an e1000 with this additional patch > >> applied and the card acted the same way it did without this patch: > >> > >> map_func = assigned_dev_iomem_map; > >> - if (cur_region->size & 0xFFF) { > >> + if (i != PCI_ROM_SLOT){ > >> fprintf(stderr, "PCI region %d at address 0x%llx " > >> > > > > The patch is pretty clean. However, I think I see a bug below. > > Did you also try a device with sub-page BAR? > > If yes, did the bar get non page aligned value? > > > > > >> Signed-off-by: Alexander Graf <agraf@xxxxxxx> > >> > >> --- > >> > >> v1 -> v2: > >> > >> - don't use map_func function pointer > >> - use the same code for mmap on fast and slow path > >> --- > >> hw/device-assignment.c | 123 +++++++++++++++++++++++++++++++++++++++++++++--- > >> 1 files changed, 116 insertions(+), 7 deletions(-) > >> > >> diff --git a/hw/device-assignment.c b/hw/device-assignment.c > >> index 13a86bb..5cee929 100644 > >> --- a/hw/device-assignment.c > >> +++ b/hw/device-assignment.c > >> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr) > >> return value; > >> } > >> > >> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr) > >> +{ > >> + AssignedDevRegion *d = opaque; > >> + uint8_t *in = (uint8_t*)(d->u.r_virtbase + addr); > >> > > > > this is a void* pointer, no need to cast > > > > > >> + uint32_t r = -1; > >> > > > > don't initialize r here as you override 1 line below. > > > >> + > >> + r = *in; > >> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); > >> + > >> + return r; > >> +} > >> + > >> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr) > >> +{ > >> + AssignedDevRegion *d = opaque; > >> + uint16_t *in = (uint16_t*)(d->u.r_virtbase + addr); > >> + uint32_t r = -1; > >> + > >> + r = *in; > >> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); > >> + > >> + return r; > >> > > > > same as above > > > > > >> +} > >> + > >> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr) > >> +{ > >> + AssignedDevRegion *d = opaque; > >> + uint32_t *in = (uint32_t*)(d->u.r_virtbase + addr); > >> + uint32_t r = -1; > >> + > >> + r = *in; > >> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); > >> + > >> + return r; > >> > > > > same as above > > > > > >> +} > >> + > >> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val) > >> +{ > >> + AssignedDevRegion *d = opaque; > >> + uint8_t *out = (uint8_t*)(d->u.r_virtbase + addr); > >> > > > > no need for cast as r_virtbase is a void pointer. > > > > > >> + > >> + DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val); > >> + *out = val; > >> +} > >> + > >> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val) > >> +{ > >> + AssignedDevRegion *d = opaque; > >> + uint16_t *out = (uint16_t*)(d->u.r_virtbase + addr); > >> + > >> + DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val); > >> + *out = val; > >> +} > >> + > >> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val) > >> +{ > >> + AssignedDevRegion *d = opaque; > >> + uint32_t *out = (uint32_t*)(d->u.r_virtbase + addr); > >> > > > > same as above > > > > > >> + > >> + DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val); > >> + *out = val; > >> +} > >> + > >> +static CPUWriteMemoryFunc * const slow_bar_write[] = { > >> + &slow_bar_writeb, > >> + &slow_bar_writew, > >> + &slow_bar_writel > >> +}; > >> + > >> +static CPUReadMemoryFunc * const slow_bar_read[] = { > >> + &slow_bar_readb, > >> + &slow_bar_readw, > >> + &slow_bar_readl > >> +}; > >> + > >> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num, > >> + pcibus_t e_phys, pcibus_t e_size, > >> + int type) > >> +{ > >> + AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev); > >> + AssignedDevRegion *region = &r_dev->v_addrs[region_num]; > >> + PCIRegion *real_region = &r_dev->real_device.regions[region_num]; > >> + int m; > >> + > >> + DEBUG("slow map\n"); > >> + m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region); > >> + cpu_register_physical_memory(e_phys, e_size, m); > >> + > >> + /* MSI-X MMIO page */ > >> > > > > some code duplication ... use a common function? > > > > I'm not sure. It's pretty little duplication. If you feel like this > should be merged, feel free to make a cleanup patch afterwards :-). > > > > >> + if ((e_size > 0) && > >> + real_region->base_addr <= r_dev->msix_table_addr && > >> + real_region->base_addr + real_region->size >= r_dev->msix_table_addr) { > >> + int offset = r_dev->msix_table_addr - real_region->base_addr; > >> + > >> + cpu_register_physical_memory(e_phys + offset, > >> + TARGET_PAGE_SIZE, r_dev->mmio_index); > >> > > > > How does this work? Does the last registered callback win > > or are both called for MSIX page? > > > > In qemu memory allocations the last one always wins. > > > > >> + } > >> +} > >> + > >> static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num, > >> pcibus_t e_phys, pcibus_t e_size, int type) > >> { > >> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions, > >> > >> /* handle memory io regions */ > >> if (cur_region->type & IORESOURCE_MEM) { > >> + int slow_map = 0; > >> int t = cur_region->type & IORESOURCE_PREFETCH > >> ? PCI_BASE_ADDRESS_MEM_PREFETCH > >> : PCI_BASE_ADDRESS_SPACE_MEMORY; > >> + > >> if (cur_region->size & 0xFFF) { > >> - fprintf(stderr, "Unable to assign device: PCI region %d " > >> - "at address 0x%llx has size 0x%x, " > >> - " which is not a multiple of 4K\n", > >> + fprintf(stderr, "PCI region %d at address 0x%llx " > >> + "has size 0x%x, which is not a multiple of 4K. " > >> + "Using slow map\n", > >> > > > > I still think "Using slow map" tells the user nothing. > > How about "Disabling direct guest access, device will be slow"? > > > > Changed. > > > > >> i, (unsigned long long)cur_region->base_addr, > >> cur_region->size); > >> + slow_map = 1; > >> + } > >> + > >> + if (slow_map && (i == PCI_ROM_SLOT)) { > >> + fprintf(stderr, "ROM not aligned - can't continue\n"); > >> return -1; > >> } > >> > >> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions, > >> } else { > >> pci_dev->v_addrs[i].u.r_virtbase = > >> mmap(NULL, > >> - (cur_region->size + 0xFFF) & 0xFFFFF000, > >> + cur_region->size, > >> PROT_WRITE | PROT_READ, MAP_SHARED, > >> cur_region->resource_fd, (off_t) 0); > >> } > >> @@ -429,12 +535,15 @@ static int assigned_dev_register_regions(PCIRegion *io_regions, > >> pci_dev->v_addrs[i].e_size = 0; > >> > >> /* add offset */ > >> - pci_dev->v_addrs[i].u.r_virtbase += > >> - (cur_region->base_addr & 0xFFF); > >> + if (!slow_map) { > >> + pci_dev->v_addrs[i].u.r_virtbase += > >> + (cur_region->base_addr & 0xFFF); > >> + } > >> > > > > This looks wrong. I think mmap returns a page aligned address, > > that's why we did this offset math. No? > > And if not, there was no reason for this code in the first place. > > > > Yeah, I guess you're right. > > Alex I guess this means you'll have to find a device with a sub-page BAR to test this on, instead of hacking driver for a device with full page BAR. Did anyone ever try doing passthrough on an emulated device in nested virt? -- MST -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html