On Fri, Dec 11, 2009 at 12:06:25AM +0100, Alexander Graf wrote:
> While trying to get device passthrough working with an emulex hba, kvm
> refused to pass it through because it has a BAR of 256 bytes:
>
>         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>         Region 4: I/O ports at b100 [size=256]
>
> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
> physical to virtual addresses, we can still take the old MMIO callback route.
>
> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
> by looping it through userspace.
>
> I verified that it works by passing through an e1000 with this additional patch
> applied and the card acted the same way it did without this patch:
>
>             map_func = assigned_dev_iomem_map;
> -           if (cur_region->size & 0xFFF) {
> +           if (i != PCI_ROM_SLOT){
>                 fprintf(stderr, "PCI region %d at address 0x%llx "

The patch is pretty clean. However, I think I see a bug below.
Did you also try a device with a sub-page BAR? If so, did the BAR
get a non-page-aligned value?

> Signed-off-by: Alexander Graf <agraf@xxxxxxx>
>
> ---
>
> v1 -> v2:
>
>   - don't use map_func function pointer
>   - use the same code for mmap on fast and slow path
> ---
>  hw/device-assignment.c |  123 +++++++++++++++++++++++++++++++++++++++++++++---
>  1 files changed, 116 insertions(+), 7 deletions(-)
>
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 13a86bb..5cee929 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
>      return value;
>  }
>
> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *in = (uint8_t*)(d->u.r_virtbase + addr);

r_virtbase is a void * pointer, no need to cast.

> +    uint32_t r = -1;

Don't initialize r here, you overwrite it one line below.

> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *in = (uint16_t*)(d->u.r_virtbase + addr);
> +    uint32_t r = -1;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;

same as above

> +}
> +
> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *in = (uint32_t*)(d->u.r_virtbase + addr);
> +    uint32_t r = -1;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;

same as above

> +}
> +
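To be concrete about the two small nits above, the read accessors could be
reduced to something like this (purely illustrative, only readb shown; the
16/32-bit variants and the writers would follow the same pattern):

static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
{
    AssignedDevRegion *d = opaque;
    uint8_t *in = d->u.r_virtbase + addr;  /* no cast: r_virtbase is void * */
    uint32_t r = *in;                      /* no dummy -1 initialization */

    DEBUG("slow_bar_readb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, r);

    return r;
}

(Side note: the DEBUG strings in all three read helpers say "slow_bar_readl";
worth fixing that while at it.)
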
> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *out = (uint8_t*)(d->u.r_virtbase + addr);

no need for cast as r_virtbase is a void pointer.

> +
> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *out = (uint16_t*)(d->u.r_virtbase + addr);
> +
> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *out = (uint32_t*)(d->u.r_virtbase + addr);

same as above

> +
> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
> +    *out = val;
> +}
> +
> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
> +    &slow_bar_writeb,
> +    &slow_bar_writew,
> +    &slow_bar_writel
> +};
> +
> +static CPUReadMemoryFunc * const slow_bar_read[] = {
> +    &slow_bar_readb,
> +    &slow_bar_readw,
> +    &slow_bar_readl
> +};
> +
> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
> +                                        pcibus_t e_phys, pcibus_t e_size,
> +                                        int type)
> +{
> +    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> +    int m;
> +
> +    DEBUG("slow map\n");
> +    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
> +    cpu_register_physical_memory(e_phys, e_size, m);
> +
> +    /* MSI-X MMIO page */

some code duplication ... use a common function?

> +    if ((e_size > 0) &&
> +        real_region->base_addr <= r_dev->msix_table_addr &&
> +        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
> +        int offset = r_dev->msix_table_addr - real_region->base_addr;
> +
> +        cpu_register_physical_memory(e_phys + offset,
> +                                     TARGET_PAGE_SIZE, r_dev->mmio_index);

How does this work? Does the last registered callback win
or are both called for MSIX page?

> +    }
> +}
> +
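Re the duplication note above: I had in mind a small shared helper, roughly
along these lines (name and signature are just for illustration, not existing
code; the body is lifted from the patch):

static void assigned_dev_register_msix_page(AssignedDevice *r_dev,
                                            PCIRegion *real_region,
                                            pcibus_t e_phys, pcibus_t e_size)
{
    /* MSI-X MMIO page */
    if ((e_size > 0) &&
        real_region->base_addr <= r_dev->msix_table_addr &&
        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
        int offset = r_dev->msix_table_addr - real_region->base_addr;

        cpu_register_physical_memory(e_phys + offset,
                                     TARGET_PAGE_SIZE, r_dev->mmio_index);
    }
}

Then both assigned_dev_iomem_map() and assigned_dev_iomem_map_slow() could
just call it after registering their own memory.
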
>  static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>                                     pcibus_t e_phys, pcibus_t e_size, int type)
>  {
> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>
>          /* handle memory io regions */
>          if (cur_region->type & IORESOURCE_MEM) {
> +            int slow_map = 0;
>              int t = cur_region->type & IORESOURCE_PREFETCH
>                  ? PCI_BASE_ADDRESS_MEM_PREFETCH
>                  : PCI_BASE_ADDRESS_SPACE_MEMORY;
> +
>              if (cur_region->size & 0xFFF) {
> -                fprintf(stderr, "Unable to assign device: PCI region %d "
> -                        "at address 0x%llx has size 0x%x, "
> -                        " which is not a multiple of 4K\n",
> +                fprintf(stderr, "PCI region %d at address 0x%llx "
> +                        "has size 0x%x, which is not a multiple of 4K. "
> +                        "Using slow map\n",

I still think "Using slow map" tells the user nothing.
How about "Disabling direct guest access, device will be slow"?

>                          i, (unsigned long long)cur_region->base_addr,
>                          cur_region->size);
> +                slow_map = 1;
> +            }
> +
> +            if (slow_map && (i == PCI_ROM_SLOT)) {
> +                fprintf(stderr, "ROM not aligned - can't continue\n");
>                  return -1;
>              }
>
> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>              } else {
>                  pci_dev->v_addrs[i].u.r_virtbase =
>                      mmap(NULL,
> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
> +                         cur_region->size,
>                           PROT_WRITE | PROT_READ, MAP_SHARED,
>                           cur_region->resource_fd, (off_t) 0);
>              }
> @@ -429,12 +535,15 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>              pci_dev->v_addrs[i].e_size = 0;
>
>              /* add offset */
> -            pci_dev->v_addrs[i].u.r_virtbase +=
> -                (cur_region->base_addr & 0xFFF);
> +            if (!slow_map) {
> +                pci_dev->v_addrs[i].u.r_virtbase +=
> +                    (cur_region->base_addr & 0xFFF);
> +            }

This looks wrong. I think mmap returns a page-aligned address; that's why we
did this offset math, no? And if not, there was no reason for this code in
the first place.

>
>              pci_register_bar((PCIDevice *) pci_dev, i,
>                               cur_region->size, t,
> -                             slow_map ? assigned_dev_iomem_map_slow
> -                                      : assigned_dev_iomem_map);
>              continue;
>          }
>          /* handle port io regions */
> --
> 1.6.0.2
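One more note to spell out the concern about the offset math: as far as I
remember, mmap() only hands back page-aligned addresses, so for a BAR whose
physical base is not page-aligned the mapping starts at the containing page
and the registers sit at (base_addr & 0xFFF) inside it. Roughly (purely
illustrative helper, not code from the patch; whether the resource_fd mmap
path really behaves like this is exactly what I'm asking):

static void *map_sub_page_bar(int fd, uint64_t base_addr, size_t size)
{
    size_t page_offset = base_addr & 0xFFF;
    /* map whole pages, then point at the BAR inside the first page */
    void *virtbase = mmap(NULL, (page_offset + size + 0xFFF) & ~(size_t)0xFFF,
                          PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

    if (virtbase == MAP_FAILED) {
        return NULL;
    }

    return virtbase + page_offset;  /* gcc void * arithmetic, as elsewhere */
}

If that is indeed how the mapping behaves, skipping the adjustment on the
slow path would make the guest access the wrong part of the page whenever
the host BAR is not page-aligned.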