On Mon, Jun 27, 2011 at 08:19:55PM +0200, Jan Kiszka wrote: > From: Jan Kiszka <jan.kiszka@xxxxxxxxxxx> > > This drastically simplifies config space access management: Instead of > coding various range checks and merging bits, set up two access control > bitmaps. One defines, which bits can be directly read from the device, > the other allows direct write to the device, also with bit-granularity. > > The setup of those bitmaps, specifically the corner cases like the > capability list status bit, is way more readable than the existing code. > And the generic config access functions just need one additional logic > to catch writes to the MSI/MSI-X control flags and call the related > update handlers. > > Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx> > --- > hw/device-assignment.c | 325 ++++++++++++++++------------------------------- > hw/device-assignment.h | 3 +- > 2 files changed, 113 insertions(+), 215 deletions(-) > > diff --git a/hw/device-assignment.c b/hw/device-assignment.c > index 6a2ca8c..be199d2 100644 > --- a/hw/device-assignment.c > +++ b/hw/device-assignment.c > @@ -63,35 +63,6 @@ static void assigned_dev_load_option_rom(AssignedDevice *dev); > > static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev); > > -static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev, > - uint32_t address, > - uint32_t val, int len); > - > -static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev, > - uint32_t address, int len); > - > -/* Merge the bits set in mask from mval into val. Both val and mval are > - * at the same addr offset, pos is the starting offset of the mask. */ > -static uint32_t merge_bits(uint32_t val, uint32_t mval, uint8_t addr, > - int len, uint8_t pos, uint32_t mask) > -{ > - if (!ranges_overlap(addr, len, pos, 4)) { > - return val; > - } > - > - if (addr >= pos) { > - mask >>= (addr - pos) * 8; > - } else { > - mask <<= (pos - addr) * 8; > - } > - mask &= 0xffffffffU >> (4 - len) * 8; > - > - val &= ~mask; > - val |= (mval & mask); > - > - return val; > -} > - > static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region, > uint32_t addr, int len, uint32_t *val) > { > @@ -404,143 +375,6 @@ static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap, uint8_t start) > return 0; > } > > -static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address, > - uint32_t val, int len) > -{ > - int fd; > - ssize_t ret; > - AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d); > - > - DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", > - ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), > - (uint16_t) address, val, len); > - > - if (address >= PCI_CONFIG_HEADER_SIZE && d->config_map[address]) { > - return assigned_device_pci_cap_write_config(d, address, val, len); > - } > - > - if (ranges_overlap(address, len, PCI_COMMAND, 2)) { > - pci_default_write_config(d, address, val, len); > - /* Continue to program the card */ > - } > - > - /* > - * Catch access to > - * - base address registers > - * - ROM base address & capability pointer > - * - interrupt line & pin > - */ > - if (ranges_overlap(address, len, PCI_BASE_ADDRESS_0, 24) || > - ranges_overlap(address, len, PCI_ROM_ADDRESS, 4)) { > - pci_default_write_config(d, address, val, len); > - return; > - } else if (ranges_overlap(address, len, PCI_CAPABILITY_LIST, 1) || > - ranges_overlap(address, len, PCI_INTERRUPT_LINE, 2)) { > - uint32_t real_val; > - > - pci_default_write_config(d, address, val, len); > - > - /* Ensure that writes to overlapping areas we don't virtualize still > - * hit the device. */ > - real_val = assigned_dev_pci_read(d, address, len); > - val = merge_bits(val, real_val, address, len, > - PCI_CAPABILITY_LIST, 0xff); > - val = merge_bits(val, real_val, address, len, > - PCI_INTERRUPT_LINE, 0xffff); > - } > - > - DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n", > - ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), > - (uint16_t) address, val, len); > - > - fd = pci_dev->real_device.config_fd; > - > -again: > - ret = pwrite(fd, &val, len, address); > - if (ret != len) { > - if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) > - goto again; > - > - fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n", > - __func__, ret, errno); > - > - exit(1); > - } > -} > - > -static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address, > - int len) > -{ > - uint32_t val = 0, virt_val; > - int fd; > - ssize_t ret; > - AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d); > - > - if (address >= PCI_CONFIG_HEADER_SIZE && d->config_map[address]) { > - val = assigned_device_pci_cap_read_config(d, address, len); > - DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", > - (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len); > - return val; > - } > - > - /* > - * Catch access to > - * - vendor & device ID > - * - base address registers > - * - ROM base address > - */ > - if (ranges_overlap(address, len, PCI_VENDOR_ID, 4) || > - ranges_overlap(address, len, PCI_BASE_ADDRESS_0, 24) || > - ranges_overlap(address, len, PCI_ROM_ADDRESS, 4)) { > - val = pci_default_read_config(d, address, len); > - DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", > - (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len); > - return val; > - } > - > - fd = pci_dev->real_device.config_fd; > - > -again: > - ret = pread(fd, &val, len, address); > - if (ret != len) { > - if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) > - goto again; > - > - fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n", > - __func__, ret, errno); > - > - exit(1); > - } > - > - DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", > - (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len); > - > - if (pci_dev->emulate_cmd_mask) { > - val = merge_bits(val, pci_default_read_config(d, address, len), > - address, len, PCI_COMMAND, pci_dev->emulate_cmd_mask); > - } > - > - /* > - * Merge bits from virtualized > - * - capability pointer > - * - interrupt line & pin > - */ > - virt_val = pci_default_read_config(d, address, len); > - val = merge_bits(val, virt_val, address, len, PCI_CAPABILITY_LIST, 0xff); > - val = merge_bits(val, virt_val, address, len, PCI_INTERRUPT_LINE, 0xffff); > - > - if (!pci_dev->cap.available) { > - /* kill the special capabilities */ > - if (address == PCI_COMMAND && len == 4) { > - val &= ~(PCI_STATUS_CAP_LIST << 16); > - } else if (address == PCI_STATUS) { > - val &= ~PCI_STATUS_CAP_LIST; > - } > - } > - > - return val; > -} > - > static int assigned_dev_register_regions(PCIRegion *io_regions, > unsigned long regions_num, > AssignedDevice *pci_dev) > @@ -790,7 +624,8 @@ again: > /* dealing with virtual function device */ > snprintf(name, sizeof(name), "%sphysfn/", dir); > if (!stat(name, &statbuf)) { > - pci_dev->emulate_cmd_mask = 0xffff; > + /* always provide the written value on readout */ > + memset(pci_dev->direct_config_read + PCI_COMMAND, 0xff, 2); > } > > dev->region_number = r; > @@ -1268,73 +1103,81 @@ static void assigned_dev_update_msix(PCIDevice *pci_dev) > } > } > > -/* There can be multiple VNDR capabilities per device, we need to find the > - * one that starts closet to the given address without going over. */ > -static uint8_t find_vndr_start(PCIDevice *pci_dev, uint32_t address) > +static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev, > + uint32_t address, int len) > { > - uint8_t cap, pos; > + AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev); > + uint32_t virt_val = pci_default_read_config(pci_dev, address, len); > + uint32_t real_val, direct_mask; > > - for (cap = pos = 0; > - (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos)); > - pos += PCI_CAP_LIST_NEXT) { > - if (pos <= address) { > - cap = MAX(pos, cap); > - } > + switch (len) { > + case 4: > + direct_mask = > + pci_get_long(assigned_dev->direct_config_read + address); > + break; > + case 2: > + direct_mask = > + pci_get_word(assigned_dev->direct_config_read + address); > + break; > + case 1: > + direct_mask = > + pci_get_byte(assigned_dev->direct_config_read + address); > + break; > + default: > + abort(); > } > - return cap; > -} > - > -static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev, > - uint32_t address, int len) > -{ > - uint8_t cap, cap_id = pci_dev->config_map[address]; > - uint32_t val; > - > - switch (cap_id) { > - > - case PCI_CAP_ID_VPD: > - cap = pci_find_capability(pci_dev, cap_id); > - val = assigned_dev_pci_read(pci_dev, address, len); > - return merge_bits(val, pci_get_long(pci_dev->config + address), > - address, len, cap + PCI_CAP_LIST_NEXT, 0xff); > - > - case PCI_CAP_ID_VNDR: > - cap = find_vndr_start(pci_dev, address); > - val = assigned_dev_pci_read(pci_dev, address, len); > - return merge_bits(val, pci_get_long(pci_dev->config + address), > - address, len, cap + PCI_CAP_LIST_NEXT, 0xff); > + if (direct_mask) { > + real_val = assigned_dev_pci_read(pci_dev, address, len); > + return (virt_val & ~direct_mask) | (real_val & direct_mask); > + } else { > + return virt_val; > } > - > - return pci_default_read_config(pci_dev, address, len); > } > > -static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev, > - uint32_t address, > - uint32_t val, int len) > +static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address, > + uint32_t val, int len) > { > - uint8_t cap_id = pci_dev->config_map[address]; > + AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev); > + uint32_t direct_mask, full_access; > > pci_default_write_config(pci_dev, address, val, len); > - switch (cap_id) { > - case PCI_CAP_ID_MSI: > + > + if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) { > if (range_covers_byte(address, len, > pci_dev->msi_cap + PCI_MSI_FLAGS)) { > assigned_dev_update_msi(pci_dev); > } > - break; > - > - case PCI_CAP_ID_MSIX: > + } else if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) { > if (range_covers_byte(address, len, > pci_dev->msix_cap + PCI_MSIX_FLAGS + 1)) { > assigned_dev_update_msix(pci_dev); > } > - break; > + } > > - case PCI_CAP_ID_VPD: > - case PCI_CAP_ID_VNDR: > - assigned_dev_pci_write(pci_dev, address, val, len); > + switch (len) { > + case 4: > + direct_mask = > + pci_get_long(assigned_dev->direct_config_write + address); > + full_access = 0xffffffff; > + break; > + case 2: > + direct_mask = > + pci_get_word(assigned_dev->direct_config_write + address); > + full_access = 0xffff; > break; > + case 1: > + direct_mask = > + pci_get_byte(assigned_dev->direct_config_write + address); > + full_access = 0xff; > + break; > + default: > + abort(); > + } > + if (direct_mask != full_access) { > + val &= direct_mask; > + val |= assigned_dev_pci_read(pci_dev, address, len) & ~direct_mask; > } > + assigned_dev_pci_write(pci_dev, address, val, len); I think we should not write at all for fully virtualized fields (direct_mask == 0). E.g. PCI BARs. > } > > static int assigned_device_pci_cap_init(PCIDevice *pci_dev) > @@ -1404,6 +1247,10 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev) > return ret; > } > > + /* direct read except for next cap pointer */ > + memset(dev->direct_config_read + pos, 0xff, PCI_PM_SIZEOF); > + dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] = 0; > + > pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS); > pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI); > pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc); > @@ -1438,6 +1285,10 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev) > return ret; > } > > + /* direct read except for next cap pointer */ > + memset(dev->direct_config_read + pos, 0xff, size); > + dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] = 0; > + > type = pci_get_word(pci_dev->config + pos + PCI_EXP_FLAGS); > type = (type & PCI_EXP_FLAGS_TYPE) >> 8; > if (type != PCI_EXP_TYPE_ENDPOINT && > @@ -1513,6 +1364,10 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev) > return ret; > } > > + /* direct read except for next cap pointer */ > + memset(dev->direct_config_read + pos, 0xff, 8); > + dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] = 0; > + > /* Command register, clear upper bits, including extended modes */ > cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD); > cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ | > @@ -1534,6 +1389,13 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev) > if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_VPD, pos, 8)) < 0) { > return ret; > } > + > + /* direct read except for next cap pointer */ > + memset(dev->direct_config_read + pos, 0xff, 8); > + dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] = 0; > + > + /* direct write for cap content */ > + memset(dev->direct_config_write + pos + 2, 0xff, 6); > } > > /* Devices can have multiple vendor capabilities, get them all */ > @@ -1545,6 +1407,21 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev) > pos, len)) < 0) { > return ret; > } > + > + /* direct read except for next cap pointer */ > + memset(dev->direct_config_read + pos, 0xff, len); > + dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] = 0; Add a function for the above 3 lines? > + > + /* direct write for cap content */ > + memset(dev->direct_config_write + pos + 2, 0xff, len - 2); > + } > + > + /* If real and virtual capability list status bits differ, virtualize the > + * access. */ > + if ((pci_get_word(pci_dev->config + PCI_STATUS) & PCI_STATUS_CAP_LIST) != > + (assigned_dev_pci_read_byte(pci_dev, PCI_STATUS) & > + PCI_STATUS_CAP_LIST)) { > + dev->direct_config_read[PCI_STATUS] &= ~PCI_STATUS_CAP_LIST; > } Is this an optimization? What harm does it do to virtualize always? > > return 0; > @@ -1694,6 +1571,26 @@ static int assigned_initfn(struct PCIDevice *pci_dev) > return -1; > } > > + /* > + * Set up basic config space access control. Will be further refined during > + * device initialization. > + * > + * Direct read/write access is grangted for: typo > + * - status & command registers, revision ID & class code, cacheline size, > + * latency timer, header type, BIST > + * - cardbus CIS pointer, subsystem vendor & ID > + * - reserved fields > + * - Min_Gnt & Max_Lat > + */ > + memset(dev->direct_config_read + PCI_COMMAND, 0xff, 12); > + memset(dev->direct_config_read + PCI_CARDBUS_CIS, 0xff, 8); > + memset(dev->direct_config_read + PCI_CAPABILITY_LIST + 1, 0xff, 7); > + memset(dev->direct_config_read + PCI_MIN_GNT, 0xff, 2); > + memset(dev->direct_config_write + PCI_COMMAND, 0xff, 12); > + memset(dev->direct_config_write + PCI_CARDBUS_CIS, 0xff, 8); > + memset(dev->direct_config_write + PCI_CAPABILITY_LIST + 1, 0xff, 7); > + memset(dev->direct_config_write + PCI_MIN_GNT, 0xff, 2); > + > if (get_real_device(dev, dev->host.seg, dev->host.bus, > dev->host.dev, dev->host.func)) { > error_report("pci-assign: Error: Couldn't get real device (%s)!", > diff --git a/hw/device-assignment.h b/hw/device-assignment.h > index 3d20207..ed5defb 100644 > --- a/hw/device-assignment.h > +++ b/hw/device-assignment.h > @@ -104,12 +104,13 @@ typedef struct AssignedDevice { > #define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2) > uint32_t state; > } cap; > + uint8_t direct_config_read[PCI_CONFIG_SPACE_SIZE]; > + uint8_t direct_config_write[PCI_CONFIG_SPACE_SIZE]; What direct means isn't obvious. Add a comment? We could revert the logic, have bits for emulate_config_read/emulate_config_write. I think that most of the registers can safely be read/written directly, so protecting relevant bits will be less work. Is that true? > int irq_entries_nr; > struct kvm_irq_routing_entry *entry; > void *msix_table_page; > target_phys_addr_t msix_table_addr; > int mmio_index; > - uint32_t emulate_cmd_mask; > char *configfd_name; > int32_t bootindex; > QLIST_ENTRY(AssignedDevice) next; > -- > 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html