On Fri, Dec 11, 2009 at 12:06:25AM +0100, Alexander Graf wrote:
> While trying to get device passthrough working with an emulex hba, kvm
> refused to pass it through because it has a BAR of 256 bytes:
>
>         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>         Region 4: I/O ports at b100 [size=256]
>
> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
> physical to virtual addresses, we can still take the old MMIO callback route.
>
> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
> by looping it through userspace.
>
> I verified that it works by passing through an e1000 with this additional patch
> applied and the card acted the same way it did without this patch:
>
>             map_func = assigned_dev_iomem_map;
> -           if (cur_region->size & 0xFFF) {
> +           if (i != PCI_ROM_SLOT){
>                 fprintf(stderr, "PCI region %d at address 0x%llx "

The patch is pretty clean. However, I think I see a bug below.
Did you also try a device with a sub-page BAR? If so, did the BAR
get a non-page-aligned value?

> Signed-off-by: Alexander Graf <agraf@xxxxxxx>
>
> ---
>
> v1 -> v2:
>
>   - don't use map_func function pointer
>   - use the same code for mmap on fast and slow path
> ---
>  hw/device-assignment.c |  123 +++++++++++++++++++++++++++++++++++++++++++++---
>  1 files changed, 116 insertions(+), 7 deletions(-)
>
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 13a86bb..5cee929 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
>      return value;
>  }
>
> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *in = (uint8_t*)(d->u.r_virtbase + addr);

r_virtbase is a void * pointer, no need to cast.

> +    uint32_t r = -1;

Don't initialize r here, you overwrite it one line below.

> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *in = (uint16_t*)(d->u.r_virtbase + addr);
> +    uint32_t r = -1;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;

same as above

> +}
> +
> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *in = (uint32_t*)(d->u.r_virtbase + addr);
> +    uint32_t r = -1;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;

same as above

> +}
> +
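To be concrete about the two small nits above, the read accessors could be
reduced to something like this (purely illustrative, only readb shown; the
16/32-bit variants and the writers would follow the same pattern):

static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
{
    AssignedDevRegion *d = opaque;
    uint8_t *in = d->u.r_virtbase + addr;  /* no cast: r_virtbase is void * */
    uint32_t r = *in;                      /* no dummy -1 initialization */

    DEBUG("slow_bar_readb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, r);

    return r;
}

(Side note: the DEBUG strings in all three read helpers say "slow_bar_readl";
worth fixing that while at it.)
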
> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *out = (uint8_t*)(d->u.r_virtbase + addr);

no need for cast as r_virtbase is a void pointer.

> +
> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *out = (uint16_t*)(d->u.r_virtbase + addr);
> +
> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *out = (uint32_t*)(d->u.r_virtbase + addr);

same as above

> +
> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
> +    *out = val;
> +}
> +
> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
> +    &slow_bar_writeb,
> +    &slow_bar_writew,
> +    &slow_bar_writel
> +};
> +
> +static CPUReadMemoryFunc * const slow_bar_read[] = {
> +    &slow_bar_readb,
> +    &slow_bar_readw,
> +    &slow_bar_readl
> +};
> +
> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
> +                                        pcibus_t e_phys, pcibus_t e_size,
> +                                        int type)
> +{
> +    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> +    int m;
> +
> +    DEBUG("slow map\n");
> +    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
> +    cpu_register_physical_memory(e_phys, e_size, m);
> +
> +    /* MSI-X MMIO page */

some code duplication ... use a common function?

> +    if ((e_size > 0) &&
> +        real_region->base_addr <= r_dev->msix_table_addr &&
> +        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
> +        int offset = r_dev->msix_table_addr - real_region->base_addr;
> +
> +        cpu_register_physical_memory(e_phys + offset,
> +                                     TARGET_PAGE_SIZE, r_dev->mmio_index);

How does this work? Does the last registered callback win
or are both called for MSIX page?

> +    }
> +}
> +
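Re the duplication note above: I had in mind a small shared helper, roughly
along these lines (name and signature are just for illustration, not existing
code; the body is lifted from the patch):

static void assigned_dev_register_msix_page(AssignedDevice *r_dev,
                                            PCIRegion *real_region,
                                            pcibus_t e_phys, pcibus_t e_size)
{
    /* MSI-X MMIO page */
    if ((e_size > 0) &&
        real_region->base_addr <= r_dev->msix_table_addr &&
        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
        int offset = r_dev->msix_table_addr - real_region->base_addr;

        cpu_register_physical_memory(e_phys + offset,
                                     TARGET_PAGE_SIZE, r_dev->mmio_index);
    }
}

Then both assigned_dev_iomem_map() and assigned_dev_iomem_map_slow() could
just call it after registering their own memory.
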
>  static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>                                     pcibus_t e_phys, pcibus_t e_size, int type)
>  {
> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>
>          /* handle memory io regions */
>          if (cur_region->type & IORESOURCE_MEM) {
> +            int slow_map = 0;
>              int t = cur_region->type & IORESOURCE_PREFETCH
>                  ? PCI_BASE_ADDRESS_MEM_PREFETCH
>                  : PCI_BASE_ADDRESS_SPACE_MEMORY;
> +
>              if (cur_region->size & 0xFFF) {
> -                fprintf(stderr, "Unable to assign device: PCI region %d "
> -                        "at address 0x%llx has size 0x%x, "
> -                        " which is not a multiple of 4K\n",
> +                fprintf(stderr, "PCI region %d at address 0x%llx "
> +                        "has size 0x%x, which is not a multiple of 4K. "
> +                        "Using slow map\n",

I still think "Using slow map" tells the user nothing.
How about "Disabling direct guest access, device will be slow"?

>                          i, (unsigned long long)cur_region->base_addr,
>                          cur_region->size);
> +                slow_map = 1;
> +            }
> +
> +            if (slow_map && (i == PCI_ROM_SLOT)) {
> +                fprintf(stderr, "ROM not aligned - can't continue\n");
>                  return -1;
>              }
>
> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>              } else {
>                  pci_dev->v_addrs[i].u.r_virtbase =
>                      mmap(NULL,
> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
> +                         cur_region->size,
>                           PROT_WRITE | PROT_READ, MAP_SHARED,
>                           cur_region->resource_fd, (off_t) 0);
>              }
> @@ -429,12 +535,15 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>              pci_dev->v_addrs[i].e_size = 0;
>
>              /* add offset */
> -            pci_dev->v_addrs[i].u.r_virtbase +=
> -                (cur_region->base_addr & 0xFFF);
> +            if (!slow_map) {
> +                pci_dev->v_addrs[i].u.r_virtbase +=
> +                    (cur_region->base_addr & 0xFFF);
> +            }

This looks wrong. I think mmap returns a page-aligned address; that's why we
did this offset math, no? And if not, there was no reason for this code in
the first place.

>
>              pci_register_bar((PCIDevice *) pci_dev, i,
>                               cur_region->size, t,
> -                             slow_map ? assigned_dev_iomem_map_slow
> -                                      : assigned_dev_iomem_map);
>              continue;
>          }
>          /* handle port io regions */
> --
> 1.6.0.2
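One more note to spell out the concern about the offset math: as far as I
remember, mmap() only hands back page-aligned addresses, so for a BAR whose
physical base is not page-aligned the mapping starts at the containing page
and the registers sit at (base_addr & 0xFFF) inside it. Roughly (purely
illustrative helper, not code from the patch; whether the resource_fd mmap
path really behaves like this is exactly what I'm asking):

static void *map_sub_page_bar(int fd, uint64_t base_addr, size_t size)
{
    size_t page_offset = base_addr & 0xFFF;
    /* map whole pages, then point at the BAR inside the first page */
    void *virtbase = mmap(NULL, (page_offset + size + 0xFFF) & ~(size_t)0xFFF,
                          PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

    if (virtbase == MAP_FAILED) {
        return NULL;
    }

    return virtbase + page_offset;  /* gcc void * arithmetic, as elsewhere */
}

If that is indeed how the mapping behaves, skipping the adjustment on the
slow path would make the guest access the wrong part of the page whenever
the host BAR is not page-aligned.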