On Wed, Dec 09, 2009 at 06:38:54PM +0100, Alexander Graf wrote: > While trying to get device passthrough working with an emulex hba, kvm > refused to pass it through because it has a BAR of 256 bytes: > > Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K] > Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256] > Region 4: I/O ports at b100 [size=256] > > Since the page boundary is an arbitrary optimization to allow 1:1 mapping of > physical to virtual addresses, we can still take the old MMIO callback route. > > So let's add a second code path that allows for size & 0xFFF != 0 sized regions > by looping it through userspace. > > I verified that it works by passing through an e1000 with this additional patch > applied and the card acted the same way it did without this patch: > > map_func = assigned_dev_iomem_map; > - if (cur_region->size & 0xFFF) { > + if (i != PCI_ROM_SLOT){ > fprintf(stderr, "PCI region %d at address 0x%llx " > > Signed-off-by: Alexander Graf <agraf@xxxxxxx> Good idea. One thing I am concerned with, is that some users might see performance degradation and not see the error message. Maybe we can have a flag to optionally fail unless passthrough can be fast? I'm not really sure it's worth it to add such a flag though - what do others think? Minor nits below. > --- > hw/device-assignment.c | 129 +++++++++++++++++++++++++++++++++++++++++++++-- > 1 files changed, 123 insertions(+), 6 deletions(-) > > diff --git a/hw/device-assignment.c b/hw/device-assignment.c > index 13a86bb..beb488c 100644 > --- a/hw/device-assignment.c > +++ b/hw/device-assignment.c > @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr) > return value; > } > > +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr) > +{ > + AssignedDevRegion *d = opaque; > + uint8_t *in = (uint8_t*)(d->u.r_virtbase + addr); > + uint32_t r = -1; > + > + r = *in; > + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); > + > + return r; > +} > + > +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr) > +{ > + AssignedDevRegion *d = opaque; > + uint16_t *in = (uint16_t*)(d->u.r_virtbase + addr); > + uint32_t r = -1; > + > + r = *in; > + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); > + > + return r; > +} > + > +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr) > +{ > + AssignedDevRegion *d = opaque; > + uint32_t *in = (uint32_t*)(d->u.r_virtbase + addr); > + uint32_t r = -1; > + > + r = *in; > + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); > + > + return r; > +} > + > +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val) > +{ > + AssignedDevRegion *d = opaque; > + uint8_t *out = (uint8_t*)(d->u.r_virtbase + addr); > + > + DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val); > + *out = val; > +} > + > +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val) > +{ > + AssignedDevRegion *d = opaque; > + uint16_t *out = (uint16_t*)(d->u.r_virtbase + addr); > + > + DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val); > + *out = val; > +} > + > +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val) > +{ > + AssignedDevRegion *d = opaque; > + uint32_t *out = (uint32_t*)(d->u.r_virtbase + addr); > + > + DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val); > + *out = val; > +} > + > +static CPUWriteMemoryFunc * const slow_bar_write[] = { > + &slow_bar_writeb, > + &slow_bar_writew, > + &slow_bar_writel > +}; > + > +static CPUReadMemoryFunc * const slow_bar_read[] = { > + &slow_bar_readb, > + &slow_bar_readw, > + &slow_bar_readl > +}; > + > +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num, > + pcibus_t e_phys, pcibus_t e_size, > + int type) > +{ > + AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev); > + AssignedDevRegion *region = &r_dev->v_addrs[region_num]; > + PCIRegion *real_region = &r_dev->real_device.regions[region_num]; > + int m; > + > + DEBUG("slow map\n"); > + m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region); > + cpu_register_physical_memory(e_phys, e_size, m); > + > + /* MSI-X MMIO page */ > + if ((e_size > 0) && > + real_region->base_addr <= r_dev->msix_table_addr && > + real_region->base_addr + real_region->size >= r_dev->msix_table_addr) { > + int offset = r_dev->msix_table_addr - real_region->base_addr; > + > + cpu_register_physical_memory(e_phys + offset, > + TARGET_PAGE_SIZE, r_dev->mmio_index); > + } > +} > + > static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num, > pcibus_t e_phys, pcibus_t e_size, int type) > { > @@ -381,15 +480,25 @@ static int assigned_dev_register_regions(PCIRegion *io_regions, > > /* handle memory io regions */ > if (cur_region->type & IORESOURCE_MEM) { > + void (*map_func)(PCIDevice *, int, pcibus_t, pcibus_t, int); > + int slow_map = 0; This is a bit too clever for my taste. Let's just do below pci_register_bar((PCIDevice *) pci_dev, i, cur_region->size, t, slow_map ? assigned_dev_iomem_map : assigned_dev_iomem_map_slow); (by the way, this cast to (PCIDevice *) is also unnecessary, isn't, it? > int t = cur_region->type & IORESOURCE_PREFETCH > ? PCI_BASE_ADDRESS_MEM_PREFETCH > : PCI_BASE_ADDRESS_SPACE_MEMORY; > + > + map_func = assigned_dev_iomem_map; > if (cur_region->size & 0xFFF) { > - fprintf(stderr, "Unable to assign device: PCI region %d " > - "at address 0x%llx has size 0x%x, " > - " which is not a multiple of 4K\n", > + fprintf(stderr, "PCI region %d at address 0x%llx " > + "has size 0x%x, which is not a multiple of 4K. " > + "Using slow map\n", Maybe replace "Using slow map" with "The device will be slow.". > i, (unsigned long long)cur_region->base_addr, > cur_region->size); > + map_func = assigned_dev_iomem_map_slow; > + slow_map = 1; > + } > + > + if (slow_map && (i == PCI_ROM_SLOT)) { > + fprintf(stderr, "ROM not aligned - can't continue\n"); > return -1; > } > > @@ -402,6 +511,12 @@ static int assigned_dev_register_regions(PCIRegion *io_regions, > PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, > 0, (off_t) 0); > > + } else if (slow_map) { > + pci_dev->v_addrs[i].u.r_virtbase = > + mmap(NULL, > + cur_region->size, > + PROT_WRITE | PROT_READ, MAP_SHARED, > + cur_region->resource_fd, (off_t) 0); Isn't this the same as the case below it? I expect mmap to act identically for slow and fast ... Just replace (cur_region->size + 0xFFF) & 0xFFFFF000, with cur_region->size. > } else { > pci_dev->v_addrs[i].u.r_virtbase = > mmap(NULL, > @@ -429,12 +544,14 @@ static int assigned_dev_register_regions(PCIRegion *io_regions, > pci_dev->v_addrs[i].e_size = 0; > > /* add offset */ > - pci_dev->v_addrs[i].u.r_virtbase += > - (cur_region->base_addr & 0xFFF); > + if (!slow_map) { > + pci_dev->v_addrs[i].u.r_virtbase += > + (cur_region->base_addr & 0xFFF); > + } > > pci_register_bar((PCIDevice *) pci_dev, i, > cur_region->size, t, > - assigned_dev_iomem_map); > + map_func); > continue; > } > /* handle port io regions */ > -- > 1.6.0.2 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html