Re: [PATCH 12/13] pci-assign: Generic config space access management

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Jun 27, 2011 at 08:19:55PM +0200, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
> 
> This drastically simplifies config space access management: Instead of
> coding various range checks and merging bits, set up two access control
> bitmaps. One defines, which bits can be directly read from the device,
> the other allows direct write to the device, also with bit-granularity.
> 
> The setup of those bitmaps, specifically the corner cases like the
> capability list status bit, is way more readable than the existing code.
> And the generic config access functions just need one additional logic
> to catch writes to the MSI/MSI-X control flags and call the related
> update handlers.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
> ---
>  hw/device-assignment.c |  325 ++++++++++++++++-------------------------------
>  hw/device-assignment.h |    3 +-
>  2 files changed, 113 insertions(+), 215 deletions(-)
> 
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 6a2ca8c..be199d2 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -63,35 +63,6 @@ static void assigned_dev_load_option_rom(AssignedDevice *dev);
>  
>  static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);
>  
> -static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
> -                                                 uint32_t address,
> -                                                 uint32_t val, int len);
> -
> -static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
> -                                                    uint32_t address, int len);
> -
> -/* Merge the bits set in mask from mval into val.  Both val and mval are
> - * at the same addr offset, pos is the starting offset of the mask. */
> -static uint32_t merge_bits(uint32_t val, uint32_t mval, uint8_t addr,
> -                           int len, uint8_t pos, uint32_t mask)
> -{
> -    if (!ranges_overlap(addr, len, pos, 4)) {
> -        return val;
> -    }
> -
> -    if (addr >= pos) {
> -        mask >>= (addr - pos) * 8;
> -    } else {
> -        mask <<= (pos - addr) * 8;
> -    }
> -    mask &= 0xffffffffU >> (4 - len) * 8;
> -
> -    val &= ~mask;
> -    val |= (mval & mask);
> -
> -    return val;
> -}
> -
>  static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
>                                         uint32_t addr, int len, uint32_t *val)
>  {
> @@ -404,143 +375,6 @@ static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap, uint8_t start)
>      return 0;
>  }
>  
> -static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
> -                                          uint32_t val, int len)
> -{
> -    int fd;
> -    ssize_t ret;
> -    AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d);
> -
> -    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> -          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
> -          (uint16_t) address, val, len);
> -
> -    if (address >= PCI_CONFIG_HEADER_SIZE && d->config_map[address]) {
> -        return assigned_device_pci_cap_write_config(d, address, val, len);
> -    }
> -
> -    if (ranges_overlap(address, len, PCI_COMMAND, 2)) {
> -        pci_default_write_config(d, address, val, len);
> -        /* Continue to program the card */
> -    }
> -
> -    /*
> -     * Catch access to
> -     *  - base address registers
> -     *  - ROM base address & capability pointer
> -     *  - interrupt line & pin
> -     */
> -    if (ranges_overlap(address, len, PCI_BASE_ADDRESS_0, 24) ||
> -        ranges_overlap(address, len, PCI_ROM_ADDRESS, 4)) {
> -        pci_default_write_config(d, address, val, len);
> -        return;
> -    } else if (ranges_overlap(address, len, PCI_CAPABILITY_LIST, 1) ||
> -               ranges_overlap(address, len, PCI_INTERRUPT_LINE, 2)) {
> -        uint32_t real_val;
> -
> -        pci_default_write_config(d, address, val, len);
> -
> -        /* Ensure that writes to overlapping areas we don't virtualize still
> -         * hit the device. */
> -        real_val = assigned_dev_pci_read(d, address, len);
> -        val = merge_bits(val, real_val, address, len,
> -                         PCI_CAPABILITY_LIST, 0xff);
> -        val = merge_bits(val, real_val, address, len,
> -                         PCI_INTERRUPT_LINE, 0xffff);
> -    }
> -
> -    DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
> -          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
> -          (uint16_t) address, val, len);
> -
> -    fd = pci_dev->real_device.config_fd;
> -
> -again:
> -    ret = pwrite(fd, &val, len, address);
> -    if (ret != len) {
> -	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> -	    goto again;
> -
> -	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
> -		__func__, ret, errno);
> -
> -	exit(1);
> -    }
> -}
> -
> -static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
> -                                             int len)
> -{
> -    uint32_t val = 0, virt_val;
> -    int fd;
> -    ssize_t ret;
> -    AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d);
> -
> -    if (address >= PCI_CONFIG_HEADER_SIZE && d->config_map[address]) {
> -        val = assigned_device_pci_cap_read_config(d, address, len);
> -        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> -              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
> -        return val;
> -    }
> -
> -    /*
> -     * Catch access to
> -     *  - vendor & device ID
> -     *  - base address registers
> -     *  - ROM base address
> -     */
> -    if (ranges_overlap(address, len, PCI_VENDOR_ID, 4) ||
> -        ranges_overlap(address, len, PCI_BASE_ADDRESS_0, 24) ||
> -        ranges_overlap(address, len, PCI_ROM_ADDRESS, 4)) {
> -        val = pci_default_read_config(d, address, len);
> -        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> -              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
> -        return val;
> -    }
> -
> -    fd = pci_dev->real_device.config_fd;
> -
> -again:
> -    ret = pread(fd, &val, len, address);
> -    if (ret != len) {
> -	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> -	    goto again;
> -
> -	fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
> -		__func__, ret, errno);
> -
> -	exit(1);
> -    }
> -
> -    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> -          (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
> -
> -    if (pci_dev->emulate_cmd_mask) {
> -        val = merge_bits(val, pci_default_read_config(d, address, len),
> -                         address, len, PCI_COMMAND, pci_dev->emulate_cmd_mask);
> -    }
> -
> -    /*
> -     * Merge bits from virtualized
> -     *  - capability pointer
> -     *  - interrupt line & pin
> -     */
> -    virt_val = pci_default_read_config(d, address, len);
> -    val = merge_bits(val, virt_val, address, len, PCI_CAPABILITY_LIST, 0xff);
> -    val = merge_bits(val, virt_val, address, len, PCI_INTERRUPT_LINE, 0xffff);
> -
> -    if (!pci_dev->cap.available) {
> -        /* kill the special capabilities */
> -        if (address == PCI_COMMAND && len == 4) {
> -            val &= ~(PCI_STATUS_CAP_LIST << 16);
> -        } else if (address == PCI_STATUS) {
> -            val &= ~PCI_STATUS_CAP_LIST;
> -        }
> -    }
> -
> -    return val;
> -}
> -
>  static int assigned_dev_register_regions(PCIRegion *io_regions,
>                                           unsigned long regions_num,
>                                           AssignedDevice *pci_dev)
> @@ -790,7 +624,8 @@ again:
>      /* dealing with virtual function device */
>      snprintf(name, sizeof(name), "%sphysfn/", dir);
>      if (!stat(name, &statbuf)) {
> -        pci_dev->emulate_cmd_mask = 0xffff;
> +        /* always provide the written value on readout */
> +        memset(pci_dev->direct_config_read + PCI_COMMAND, 0xff, 2);
>      }
>  
>      dev->region_number = r;
> @@ -1268,73 +1103,81 @@ static void assigned_dev_update_msix(PCIDevice *pci_dev)
>      }
>  }
>  
> -/* There can be multiple VNDR capabilities per device, we need to find the
> - * one that starts closet to the given address without going over. */
> -static uint8_t find_vndr_start(PCIDevice *pci_dev, uint32_t address)
> +static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev,
> +                                             uint32_t address, int len)
>  {
> -    uint8_t cap, pos;
> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint32_t virt_val = pci_default_read_config(pci_dev, address, len);
> +    uint32_t real_val, direct_mask;
>  
> -    for (cap = pos = 0;
> -         (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos));
> -         pos += PCI_CAP_LIST_NEXT) {
> -        if (pos <= address) {
> -            cap = MAX(pos, cap);
> -        }
> +    switch (len) {
> +    case 4:
> +        direct_mask =
> +            pci_get_long(assigned_dev->direct_config_read + address);
> +        break;
> +    case 2:
> +        direct_mask =
> +            pci_get_word(assigned_dev->direct_config_read + address);
> +        break;
> +    case 1:
> +        direct_mask =
> +            pci_get_byte(assigned_dev->direct_config_read + address);
> +        break;
> +    default:
> +        abort();
>      }
> -    return cap;
> -}
> -
> -static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
> -                                                    uint32_t address, int len)
> -{
> -    uint8_t cap, cap_id = pci_dev->config_map[address];
> -    uint32_t val;
> -
> -    switch (cap_id) {
> -
> -    case PCI_CAP_ID_VPD:
> -        cap = pci_find_capability(pci_dev, cap_id);
> -        val = assigned_dev_pci_read(pci_dev, address, len);
> -        return merge_bits(val, pci_get_long(pci_dev->config + address),
> -                          address, len, cap + PCI_CAP_LIST_NEXT, 0xff);
> -
> -    case PCI_CAP_ID_VNDR:
> -        cap = find_vndr_start(pci_dev, address);
> -        val = assigned_dev_pci_read(pci_dev, address, len);
> -        return merge_bits(val, pci_get_long(pci_dev->config + address),
> -                          address, len, cap + PCI_CAP_LIST_NEXT, 0xff);
> +    if (direct_mask) {
> +        real_val = assigned_dev_pci_read(pci_dev, address, len);
> +        return (virt_val & ~direct_mask) | (real_val & direct_mask);
> +    } else {
> +        return virt_val;
>      }
> -
> -    return pci_default_read_config(pci_dev, address, len);
>  }
>  
> -static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
> -                                                 uint32_t address,
> -                                                 uint32_t val, int len)
> +static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address,
> +                                          uint32_t val, int len)
>  {
> -    uint8_t cap_id = pci_dev->config_map[address];
> +    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint32_t direct_mask, full_access;
>  
>      pci_default_write_config(pci_dev, address, val, len);
> -    switch (cap_id) {
> -    case PCI_CAP_ID_MSI:
> +
> +    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
>          if (range_covers_byte(address, len,
>                                pci_dev->msi_cap + PCI_MSI_FLAGS)) {
>              assigned_dev_update_msi(pci_dev);
>          }
> -        break;
> -
> -    case PCI_CAP_ID_MSIX:
> +    } else if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
>          if (range_covers_byte(address, len,
>                                pci_dev->msix_cap + PCI_MSIX_FLAGS + 1)) {
>              assigned_dev_update_msix(pci_dev);
>          }
> -        break;
> +    }
>  
> -    case PCI_CAP_ID_VPD:
> -    case PCI_CAP_ID_VNDR:
> -        assigned_dev_pci_write(pci_dev, address, val, len);
> +    switch (len) {
> +    case 4:
> +        direct_mask =
> +            pci_get_long(assigned_dev->direct_config_write + address);
> +        full_access = 0xffffffff;
> +        break;
> +    case 2:
> +        direct_mask =
> +            pci_get_word(assigned_dev->direct_config_write + address);
> +        full_access = 0xffff;
>          break;
> +    case 1:
> +        direct_mask =
> +            pci_get_byte(assigned_dev->direct_config_write + address);
> +        full_access = 0xff;
> +        break;
> +    default:
> +        abort();
> +    }
> +    if (direct_mask != full_access) {
> +        val &= direct_mask;
> +        val |= assigned_dev_pci_read(pci_dev, address, len) & ~direct_mask;
>      }
> +    assigned_dev_pci_write(pci_dev, address, val, len);

I think we should not write at all for fully virtualized fields
(direct_mask == 0). E.g. PCI BARs.

>  }
>  
>  static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
> @@ -1404,6 +1247,10 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
>              return ret;
>          }
>  
> +        /* direct read except for next cap pointer */
> +        memset(dev->direct_config_read + pos, 0xff, PCI_PM_SIZEOF);
> +        dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] = 0;
> +
>          pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
>          pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
>          pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);
> @@ -1438,6 +1285,10 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
>              return ret;
>          }
>  
> +        /* direct read except for next cap pointer */
> +        memset(dev->direct_config_read + pos, 0xff, size);
> +        dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] = 0;
> +
>          type = pci_get_word(pci_dev->config + pos + PCI_EXP_FLAGS);
>          type = (type & PCI_EXP_FLAGS_TYPE) >> 8;
>          if (type != PCI_EXP_TYPE_ENDPOINT &&
> @@ -1513,6 +1364,10 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
>              return ret;
>          }
>  
> +        /* direct read except for next cap pointer */
> +        memset(dev->direct_config_read + pos, 0xff, 8);
> +        dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] = 0;
> +
>          /* Command register, clear upper bits, including extended modes */
>          cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
>          cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
> @@ -1534,6 +1389,13 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
>          if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_VPD, pos, 8)) < 0) {
>              return ret;
>          }
> +
> +        /* direct read except for next cap pointer */
> +        memset(dev->direct_config_read + pos, 0xff, 8);
> +        dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] = 0;
> +
> +        /* direct write for cap content */
> +        memset(dev->direct_config_write + pos + 2, 0xff, 6);
>      }
>  
>      /* Devices can have multiple vendor capabilities, get them all */
> @@ -1545,6 +1407,21 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
>                                        pos, len)) < 0) {
>              return ret;
>          }
> +
> +        /* direct read except for next cap pointer */
> +        memset(dev->direct_config_read + pos, 0xff, len);
> +        dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] = 0;

Add a function for the above 3 lines?

> +
> +        /* direct write for cap content */
> +        memset(dev->direct_config_write + pos + 2, 0xff, len - 2);
> +    }
> +
> +    /* If real and virtual capability list status bits differ, virtualize the
> +     * access. */
> +    if ((pci_get_word(pci_dev->config + PCI_STATUS) & PCI_STATUS_CAP_LIST) !=
> +        (assigned_dev_pci_read_byte(pci_dev, PCI_STATUS) &
> +         PCI_STATUS_CAP_LIST)) {
> +        dev->direct_config_read[PCI_STATUS] &= ~PCI_STATUS_CAP_LIST;
>      }

Is this an optimization? What harm does it do to virtualize always?

>  
>      return 0;
> @@ -1694,6 +1571,26 @@ static int assigned_initfn(struct PCIDevice *pci_dev)
>          return -1;
>      }
>  
> +    /*
> +     * Set up basic config space access control. Will be further refined during
> +     * device initialization.
> +     *
> +     * Direct read/write access is grangted for:

typo

> +     *  - status & command registers, revision ID & class code, cacheline size,
> +     *    latency timer, header type, BIST
> +     *  - cardbus CIS pointer, subsystem vendor & ID
> +     *  - reserved fields
> +     *  - Min_Gnt & Max_Lat
> +     */
> +    memset(dev->direct_config_read + PCI_COMMAND, 0xff, 12);
> +    memset(dev->direct_config_read + PCI_CARDBUS_CIS, 0xff, 8);
> +    memset(dev->direct_config_read + PCI_CAPABILITY_LIST + 1, 0xff, 7);
> +    memset(dev->direct_config_read + PCI_MIN_GNT, 0xff, 2);
> +    memset(dev->direct_config_write + PCI_COMMAND, 0xff, 12);
> +    memset(dev->direct_config_write + PCI_CARDBUS_CIS, 0xff, 8);
> +    memset(dev->direct_config_write + PCI_CAPABILITY_LIST + 1, 0xff, 7);
> +    memset(dev->direct_config_write + PCI_MIN_GNT, 0xff, 2);
> +
>      if (get_real_device(dev, dev->host.seg, dev->host.bus,
>                          dev->host.dev, dev->host.func)) {
>          error_report("pci-assign: Error: Couldn't get real device (%s)!",
> diff --git a/hw/device-assignment.h b/hw/device-assignment.h
> index 3d20207..ed5defb 100644
> --- a/hw/device-assignment.h
> +++ b/hw/device-assignment.h
> @@ -104,12 +104,13 @@ typedef struct AssignedDevice {
>  #define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2)
>          uint32_t state;
>      } cap;
> +    uint8_t direct_config_read[PCI_CONFIG_SPACE_SIZE];
> +    uint8_t direct_config_write[PCI_CONFIG_SPACE_SIZE];

What direct means isn't obvious. Add a comment?

We could revert the logic, have bits for
emulate_config_read/emulate_config_write.  I think that most of the
registers can safely be read/written directly, so protecting relevant
bits will be less work. Is that true?

>      int irq_entries_nr;
>      struct kvm_irq_routing_entry *entry;
>      void *msix_table_page;
>      target_phys_addr_t msix_table_addr;
>      int mmio_index;
> -    uint32_t emulate_cmd_mask;
>      char *configfd_name;
>      int32_t bootindex;
>      QLIST_ENTRY(AssignedDevice) next;
> -- 
> 1.7.1
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux