On Tue, Dec 15, 2009 at 08:04:10PM +0100, Alexander Graf wrote: > Michael S. Tsirkin wrote: > > On Tue, Dec 15, 2009 at 07:47:35PM +0100, Alexander Graf wrote: > > > >> Michael S. Tsirkin wrote: > >> > >>> On Tue, Dec 15, 2009 at 07:30:26PM +0100, Alexander Graf wrote: > >>> > >>> > >>>> While trying to get device passthrough working with an emulex hba, kvm > >>>> refused to pass it through because it has a BAR of 256 bytes: > >>>> > >>>> Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K] > >>>> Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256] > >>>> Region 4: I/O ports at b100 [size=256] > >>>> > >>>> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of > >>>> physical to virtual addresses, we can still take the old MMIO callback route. > >>>> > >>>> So let's add a second code path that allows for size & 0xFFF != 0 sized regions > >>>> by looping it through userspace. > >>>> > >>>> I verified that it works by passing through an e1000 with this additional patch > >>>> applied and the card acted the same way it did without this patch: > >>>> > >>>> map_func = assigned_dev_iomem_map; > >>>> - if (cur_region->size & 0xFFF) { > >>>> + if (i != PCI_ROM_SLOT){ > >>>> fprintf(stderr, "PCI region %d at address 0x%llx " > >>>> > >>>> Signed-off-by: Alexander Graf <agraf@xxxxxxx> > >>>> > >>>> > >>> Looks good. > >>> 1 question: > >>> > >>> > >>> > >>>> --- > >>>> > >>>> v1 -> v2: > >>>> > >>>> - don't use map_func function pointer > >>>> - use the same code for mmap on fast and slow path > >>>> > >>>> v2 -> v3: > >>>> > >>>> - address mst's comments > >>>> --- > >>>> hw/device-assignment.c | 117 +++++++++++++++++++++++++++++++++++++++++++++-- > >>>> 1 files changed, 112 insertions(+), 5 deletions(-) > >>>> > >>>> diff --git a/hw/device-assignment.c b/hw/device-assignment.c > >>>> index 13a86bb..000fa61 100644 > >>>> --- a/hw/device-assignment.c > >>>> +++ b/hw/device-assignment.c > >>>> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr) > >>>> return value; > >>>> } > >>>> > >>>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr) > >>>> +{ > >>>> + AssignedDevRegion *d = opaque; > >>>> + uint8_t *in = d->u.r_virtbase + addr; > >>>> + uint32_t r; > >>>> + > >>>> + r = *in; > >>>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); > >>>> + > >>>> + return r; > >>>> +} > >>>> + > >>>> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr) > >>>> +{ > >>>> + AssignedDevRegion *d = opaque; > >>>> + uint16_t *in = d->u.r_virtbase + addr; > >>>> + uint32_t r; > >>>> + > >>>> + r = *in; > >>>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); > >>>> + > >>>> + return r; > >>>> +} > >>>> + > >>>> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr) > >>>> +{ > >>>> + AssignedDevRegion *d = opaque; > >>>> + uint32_t *in = d->u.r_virtbase + addr; > >>>> + uint32_t r; > >>>> + > >>>> + r = *in; > >>>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); > >>>> + > >>>> + return r; > >>>> +} > >>>> + > >>>> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val) > >>>> +{ > >>>> + AssignedDevRegion *d = opaque; > >>>> + uint8_t *out = d->u.r_virtbase + addr; > >>>> + > >>>> + DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val); > >>>> + *out = val; > >>>> +} > >>>> + > >>>> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val) > >>>> +{ > >>>> + AssignedDevRegion *d = opaque; > >>>> + uint16_t *out = d->u.r_virtbase + addr; > >>>> + > >>>> + DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val); > >>>> + *out = val; > >>>> +} > >>>> + > >>>> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val) > >>>> +{ > >>>> + AssignedDevRegion *d = opaque; > >>>> + uint32_t *out = d->u.r_virtbase + addr; > >>>> + > >>>> + DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val); > >>>> + *out = val; > >>>> +} > >>>> + > >>>> +static CPUWriteMemoryFunc * const slow_bar_write[] = { > >>>> + &slow_bar_writeb, > >>>> + &slow_bar_writew, > >>>> + &slow_bar_writel > >>>> +}; > >>>> + > >>>> +static CPUReadMemoryFunc * const slow_bar_read[] = { > >>>> + &slow_bar_readb, > >>>> + &slow_bar_readw, > >>>> + &slow_bar_readl > >>>> +}; > >>>> + > >>>> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num, > >>>> + pcibus_t e_phys, pcibus_t e_size, > >>>> + int type) > >>>> +{ > >>>> + AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev); > >>>> + AssignedDevRegion *region = &r_dev->v_addrs[region_num]; > >>>> + PCIRegion *real_region = &r_dev->real_device.regions[region_num]; > >>>> + int m; > >>>> + > >>>> + DEBUG("slow map\n"); > >>>> + m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region); > >>>> + cpu_register_physical_memory(e_phys, e_size, m); > >>>> + > >>>> + /* MSI-X MMIO page */ > >>>> + if ((e_size > 0) && > >>>> + real_region->base_addr <= r_dev->msix_table_addr && > >>>> + real_region->base_addr + real_region->size >= r_dev->msix_table_addr) { > >>>> + int offset = r_dev->msix_table_addr - real_region->base_addr; > >>>> + > >>>> + cpu_register_physical_memory(e_phys + offset, > >>>> + TARGET_PAGE_SIZE, r_dev->mmio_index); > >>>> + } > >>>> +} > >>>> + > >>>> static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num, > >>>> pcibus_t e_phys, pcibus_t e_size, int type) > >>>> { > >>>> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions, > >>>> > >>>> /* handle memory io regions */ > >>>> if (cur_region->type & IORESOURCE_MEM) { > >>>> + int slow_map = 0; > >>>> int t = cur_region->type & IORESOURCE_PREFETCH > >>>> ? PCI_BASE_ADDRESS_MEM_PREFETCH > >>>> : PCI_BASE_ADDRESS_SPACE_MEMORY; > >>>> + > >>>> if (cur_region->size & 0xFFF) { > >>>> - fprintf(stderr, "Unable to assign device: PCI region %d " > >>>> - "at address 0x%llx has size 0x%x, " > >>>> - " which is not a multiple of 4K\n", > >>>> + fprintf(stderr, "PCI region %d at address 0x%llx " > >>>> + "has size 0x%x, which is not a multiple of 4K. " > >>>> + "You might experience some performance hit due to that.\n", > >>>> i, (unsigned long long)cur_region->base_addr, > >>>> cur_region->size); > >>>> + slow_map = 1; > >>>> + } > >>>> + > >>>> + if (slow_map && (i == PCI_ROM_SLOT)) { > >>>> + fprintf(stderr, "ROM not aligned - can't continue\n"); > >>>> return -1; > >>>> > >>>> > >>> Why is this? Can't ROM be handled in the same way? > >>> I think it was previously ... > >>> > >>> > >> You can't execute code from MMIO regions, so slow_map doesn't work there. > >> > > > > Hmm, how does it work with passthrough? > > Anyway, I don't think a lot of users run ROM > > code directly, they just shadow it and run from there. > > > > > >>> > >>> > >>>> } > >>>> > >>>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions, > >>>> } else { > >>>> pci_dev->v_addrs[i].u.r_virtbase = > >>>> mmap(NULL, > >>>> - (cur_region->size + 0xFFF) & 0xFFFFF000, > >>>> + cur_region->size, > >>>> > >>>> > >>> Hmm, one assumes this code did work at some point ... > >>> do you know why was it done this way? > >>> > >>> > >> Nope :-). > >> > > > > So maybe keep it that way or make the change in a separate patch. > > > > Well we're sure the normal path is size aligned, no? In fact, you > recommended I should change the code to what this looks like ;-). > > Alex Existing code first page aligned address, passes this to mmap, then adds page offset. I do not know why. Could be e.g. to handle different libc/kernel versions. If we are not sure it is not needed, let's keep it this way. If we are sure, let's cleanup in a separate patch. Makes sense? -- MST -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html