Michael S. Tsirkin wrote: > On Fri, Dec 11, 2009 at 12:06:25AM +0100, Alexander Graf wrote: > >> While trying to get device passthrough working with an emulex hba, kvm >> refused to pass it through because it has a BAR of 256 bytes: >> >> Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K] >> Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256] >> Region 4: I/O ports at b100 [size=256] >> >> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of >> physical to virtual addresses, we can still take the old MMIO callback route. >> >> So let's add a second code path that allows for size & 0xFFF != 0 sized regions >> by looping it through userspace. >> >> I verified that it works by passing through an e1000 with this additional patch >> applied and the card acted the same way it did without this patch: >> >> map_func = assigned_dev_iomem_map; >> - if (cur_region->size & 0xFFF) { >> + if (i != PCI_ROM_SLOT){ >> fprintf(stderr, "PCI region %d at address 0x%llx " >> > > The patch is pretty clean. However, I think I see a bug below. > Did you also try a device with sub-page BAR? > If yes, did the bar get non page aligned value? > > >> Signed-off-by: Alexander Graf <agraf@xxxxxxx> >> >> --- >> >> v1 -> v2: >> >> - don't use map_func function pointer >> - use the same code for mmap on fast and slow path >> --- >> hw/device-assignment.c | 123 +++++++++++++++++++++++++++++++++++++++++++++--- >> 1 files changed, 116 insertions(+), 7 deletions(-) >> >> diff --git a/hw/device-assignment.c b/hw/device-assignment.c >> index 13a86bb..5cee929 100644 >> --- a/hw/device-assignment.c >> +++ b/hw/device-assignment.c >> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr) >> return value; >> } >> >> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr) >> +{ >> + AssignedDevRegion *d = opaque; >> + uint8_t *in = (uint8_t*)(d->u.r_virtbase + addr); >> > > this is a void* pointer, no need to cast > > >> + uint32_t r = -1; >> > > don't initialize r here as you override 1 line below. > >> + >> + r = *in; >> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); >> + >> + return r; >> +} >> + >> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr) >> +{ >> + AssignedDevRegion *d = opaque; >> + uint16_t *in = (uint16_t*)(d->u.r_virtbase + addr); >> + uint32_t r = -1; >> + >> + r = *in; >> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); >> + >> + return r; >> > > same as above > > >> +} >> + >> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr) >> +{ >> + AssignedDevRegion *d = opaque; >> + uint32_t *in = (uint32_t*)(d->u.r_virtbase + addr); >> + uint32_t r = -1; >> + >> + r = *in; >> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); >> + >> + return r; >> > > same as above > > >> +} >> + >> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val) >> +{ >> + AssignedDevRegion *d = opaque; >> + uint8_t *out = (uint8_t*)(d->u.r_virtbase + addr); >> > > no need for cast as r_virtbase is a void pointer. > > >> + >> + DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val); >> + *out = val; >> +} >> + >> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val) >> +{ >> + AssignedDevRegion *d = opaque; >> + uint16_t *out = (uint16_t*)(d->u.r_virtbase + addr); >> + >> + DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val); >> + *out = val; >> +} >> + >> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val) >> +{ >> + AssignedDevRegion *d = opaque; >> + uint32_t *out = (uint32_t*)(d->u.r_virtbase + addr); >> > > same as above > > >> + >> + DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val); >> + *out = val; >> +} >> + >> +static CPUWriteMemoryFunc * const slow_bar_write[] = { >> + &slow_bar_writeb, >> + &slow_bar_writew, >> + &slow_bar_writel >> +}; >> + >> +static CPUReadMemoryFunc * const slow_bar_read[] = { >> + &slow_bar_readb, >> + &slow_bar_readw, >> + &slow_bar_readl >> +}; >> + >> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num, >> + pcibus_t e_phys, pcibus_t e_size, >> + int type) >> +{ >> + AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev); >> + AssignedDevRegion *region = &r_dev->v_addrs[region_num]; >> + PCIRegion *real_region = &r_dev->real_device.regions[region_num]; >> + int m; >> + >> + DEBUG("slow map\n"); >> + m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region); >> + cpu_register_physical_memory(e_phys, e_size, m); >> + >> + /* MSI-X MMIO page */ >> > > some code duplication ... use a common function? > I'm not sure. It's pretty little duplication. If you feel like this should be merged, feel free to make a cleanup patch afterwards :-). > >> + if ((e_size > 0) && >> + real_region->base_addr <= r_dev->msix_table_addr && >> + real_region->base_addr + real_region->size >= r_dev->msix_table_addr) { >> + int offset = r_dev->msix_table_addr - real_region->base_addr; >> + >> + cpu_register_physical_memory(e_phys + offset, >> + TARGET_PAGE_SIZE, r_dev->mmio_index); >> > > How does this work? Does the last registered callback win > or are both called for MSIX page? > In qemu memory allocations the last one always wins. > >> + } >> +} >> + >> static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num, >> pcibus_t e_phys, pcibus_t e_size, int type) >> { >> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions, >> >> /* handle memory io regions */ >> if (cur_region->type & IORESOURCE_MEM) { >> + int slow_map = 0; >> int t = cur_region->type & IORESOURCE_PREFETCH >> ? PCI_BASE_ADDRESS_MEM_PREFETCH >> : PCI_BASE_ADDRESS_SPACE_MEMORY; >> + >> if (cur_region->size & 0xFFF) { >> - fprintf(stderr, "Unable to assign device: PCI region %d " >> - "at address 0x%llx has size 0x%x, " >> - " which is not a multiple of 4K\n", >> + fprintf(stderr, "PCI region %d at address 0x%llx " >> + "has size 0x%x, which is not a multiple of 4K. " >> + "Using slow map\n", >> > > I still think "Using slow map" tells the user nothing. > How about "Disabling direct guest access, device will be slow"? > Changed. > >> i, (unsigned long long)cur_region->base_addr, >> cur_region->size); >> + slow_map = 1; >> + } >> + >> + if (slow_map && (i == PCI_ROM_SLOT)) { >> + fprintf(stderr, "ROM not aligned - can't continue\n"); >> return -1; >> } >> >> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions, >> } else { >> pci_dev->v_addrs[i].u.r_virtbase = >> mmap(NULL, >> - (cur_region->size + 0xFFF) & 0xFFFFF000, >> + cur_region->size, >> PROT_WRITE | PROT_READ, MAP_SHARED, >> cur_region->resource_fd, (off_t) 0); >> } >> @@ -429,12 +535,15 @@ static int assigned_dev_register_regions(PCIRegion *io_regions, >> pci_dev->v_addrs[i].e_size = 0; >> >> /* add offset */ >> - pci_dev->v_addrs[i].u.r_virtbase += >> - (cur_region->base_addr & 0xFFF); >> + if (!slow_map) { >> + pci_dev->v_addrs[i].u.r_virtbase += >> + (cur_region->base_addr & 0xFFF); >> + } >> > > This looks wrong. I think mmap returns a page aligned address, > that's why we did this offset math. No? > And if not, there was no reason for this code in the first place. > Yeah, I guess you're right. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html