Re: [PATCH 2/7] pci: memory access API and IOMMU support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sat, Aug 28, 2010 at 05:54:53PM +0300, Eduard - Gabriel Munteanu wrote:
> PCI devices should access memory through pci_memory_*() instead of
> cpu_physical_memory_*(). This also provides support for translation and
> access checking in case an IOMMU is emulated.
> 
> Memory maps are treated as remote IOTLBs (that is, translation caches
> belonging to the IOMMU-aware device itself). Clients (devices) must
> provide callbacks for map invalidation in case these maps are
> persistent beyond the current I/O context, e.g. AIO DMA transfers.
> 
> Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@xxxxxxxxxxx>


I am concerned about adding more pointer chaising on data path.
Could we have
1. an iommu pointer in a device, inherited by secondary buses
   when they are created and by devices from buses when they are attached.
2. translation pointer in the iommu instead of the bus
3. pci_memory_XX functions inline, doing fast path for non-iommu case:
   
	if (__builtin_expect(!dev->iommu, 1)
		return cpu_memory_rw
	


> ---
>  hw/pci.c           |  185 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/pci.h           |   74 +++++++++++++++++++++
>  hw/pci_internals.h |   12 ++++
>  qemu-common.h      |    1 +
>  4 files changed, 271 insertions(+), 1 deletions(-)

Almost nothing here is PCI specific.
Can this code go into dma.c/dma.h?
We would have struct DMADevice, APIs like device_dma_write etc.
This would help us get rid of the void * stuff as well?

> 
> diff --git a/hw/pci.c b/hw/pci.c
> index 2dc1577..b460905 100644
> --- a/hw/pci.c
> +++ b/hw/pci.c
> @@ -158,6 +158,19 @@ static void pci_device_reset(PCIDevice *dev)
>      pci_update_mappings(dev);
>  }
>  
> +static int pci_no_translate(PCIDevice *iommu,
> +                            PCIDevice *dev,
> +                            pcibus_t addr,
> +                            target_phys_addr_t *paddr,
> +                            target_phys_addr_t *len,
> +                            unsigned perms)
> +{
> +    *paddr = addr;
> +    *len = -1;
> +
> +    return 0;
> +}
> +
>  static void pci_bus_reset(void *opaque)
>  {
>      PCIBus *bus = opaque;
> @@ -220,7 +233,10 @@ void pci_bus_new_inplace(PCIBus *bus, DeviceState *parent,
>  {
>      qbus_create_inplace(&bus->qbus, &pci_bus_info, parent, name);
>      assert(PCI_FUNC(devfn_min) == 0);
> -    bus->devfn_min = devfn_min;
> +
> +    bus->devfn_min  = devfn_min;
> +    bus->iommu      = NULL;
> +    bus->translate  = pci_no_translate;
>  
>      /* host bridge */
>      QLIST_INIT(&bus->child);
> @@ -1789,3 +1805,170 @@ static char *pcibus_get_dev_path(DeviceState *dev)
>      return strdup(path);
>  }
>  
> +void pci_register_iommu(PCIDevice *iommu,
> +                        PCITranslateFunc *translate)
> +{
> +    iommu->bus->iommu = iommu;
> +    iommu->bus->translate = translate;
> +}
> +

The above seems broken for secondary buses, right?  Also, can we use
qdev for initialization in some way, instead of adding more APIs?  E.g.
I think it would be nice if we could just use qdev command line flags to
control which bus is behind iommu and which isn't.


> +void pci_memory_rw(PCIDevice *dev,
> +                   pcibus_t addr,
> +                   uint8_t *buf,
> +                   pcibus_t len,
> +                   int is_write)
> +{
> +    int err;
> +    unsigned perms;
> +    PCIDevice *iommu = dev->bus->iommu;
> +    target_phys_addr_t paddr, plen;
> +
> +    perms = is_write ? IOMMU_PERM_WRITE : IOMMU_PERM_READ;
> +
> +    while (len) {
> +        err = dev->bus->translate(iommu, dev, addr, &paddr, &plen, perms);
> +        if (err)
> +            return;
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len)
> +            plen = len;
> +
> +        cpu_physical_memory_rw(paddr, buf, plen, is_write);
> +
> +        len -= plen;
> +        addr += plen;
> +        buf += plen;
> +    }
> +}
> +
> +static void pci_memory_register_map(PCIDevice *dev,
> +                                    pcibus_t addr,
> +                                    pcibus_t len,
> +                                    target_phys_addr_t paddr,
> +                                    PCIInvalidateMapFunc *invalidate,
> +                                    void *invalidate_opaque)
> +{
> +    PCIMemoryMap *map;
> +
> +    map = qemu_malloc(sizeof(PCIMemoryMap));
> +    map->addr               = addr;
> +    map->len                = len;
> +    map->paddr              = paddr;
> +    map->invalidate         = invalidate;
> +    map->invalidate_opaque  = invalidate_opaque;
> +
> +    QLIST_INSERT_HEAD(&dev->memory_maps, map, list);
> +}
> +
> +static void pci_memory_unregister_map(PCIDevice *dev,
> +                                      target_phys_addr_t paddr,
> +                                      target_phys_addr_t len)
> +{
> +    PCIMemoryMap *map;
> +
> +    QLIST_FOREACH(map, &dev->memory_maps, list) {
> +        if (map->paddr == paddr && map->len == len) {
> +            QLIST_REMOVE(map, list);
> +            free(map);
> +        }
> +    }
> +}
> +
> +void pci_memory_invalidate_range(PCIDevice *dev,
> +                                 pcibus_t addr,
> +                                 pcibus_t len)
> +{
> +    PCIMemoryMap *map;
> +
> +    QLIST_FOREACH(map, &dev->memory_maps, list) {
> +        if (ranges_overlap(addr, len, map->addr, map->len)) {
> +            map->invalidate(map->invalidate_opaque);
> +            QLIST_REMOVE(map, list);
> +            free(map);
> +        }
> +    }
> +}
> +
> +void *pci_memory_map(PCIDevice *dev,
> +                     PCIInvalidateMapFunc *cb,
> +                     void *opaque,
> +                     pcibus_t addr,
> +                     target_phys_addr_t *len,
> +                     int is_write)
> +{
> +    int err;
> +    unsigned perms;
> +    PCIDevice *iommu = dev->bus->iommu;
> +    target_phys_addr_t paddr, plen;
> +
> +    perms = is_write ? IOMMU_PERM_WRITE : IOMMU_PERM_READ;
> +
> +    plen = *len;
> +    err = dev->bus->translate(iommu, dev, addr, &paddr, &plen, perms);
> +    if (err)
> +        return NULL;
> +
> +    /*
> +     * If this is true, the virtual region is contiguous,
> +     * but the translated physical region isn't. We just
> +     * clamp *len, much like cpu_physical_memory_map() does.
> +     */
> +    if (plen < *len)
> +        *len = plen;
> +
> +    /* We treat maps as remote TLBs to cope with stuff like AIO. */
> +    if (cb)
> +        pci_memory_register_map(dev, addr, *len, paddr, cb, opaque);
> +
> +    return cpu_physical_memory_map(paddr, len, is_write);
> +}
> +

All the above is really only useful for when there is an iommu,
right? So maybe we should shortcut all this if there's no iommu?

> +void pci_memory_unmap(PCIDevice *dev,
> +                      void *buffer,
> +                      target_phys_addr_t len,
> +                      int is_write,
> +                      target_phys_addr_t access_len)
> +{
> +    cpu_physical_memory_unmap(buffer, len, is_write, access_len);
> +    pci_memory_unregister_map(dev, (target_phys_addr_t) buffer, len);
> +}
> +
> +#define DEFINE_PCI_LD(suffix, size)                                       \
> +uint##size##_t pci_ld##suffix(PCIDevice *dev, pcibus_t addr)              \
> +{                                                                         \
> +    int err;                                                              \
> +    target_phys_addr_t paddr, plen;                                       \
> +                                                                          \
> +    err = dev->bus->translate(dev->bus->iommu, dev,                       \
> +                              addr, &paddr, &plen, IOMMU_PERM_READ);      \
> +    if (err || (plen < size / 8))                                         \
> +        return 0;                                                         \
> +                                                                          \
> +    return ld##suffix##_phys(paddr);                                      \
> +}
> +
> +#define DEFINE_PCI_ST(suffix, size)                                       \
> +void pci_st##suffix(PCIDevice *dev, pcibus_t addr, uint##size##_t val)    \
> +{                                                                         \
> +    int err;                                                              \
> +    target_phys_addr_t paddr, plen;                                       \
> +                                                                          \
> +    err = dev->bus->translate(dev->bus->iommu, dev,                       \
> +                              addr, &paddr, &plen, IOMMU_PERM_WRITE);     \
> +    if (err || (plen < size / 8))                                         \
> +        return;                                                           \
> +                                                                          \
> +    st##suffix##_phys(paddr, val);                                        \
> +}
> +
> +DEFINE_PCI_LD(ub, 8)
> +DEFINE_PCI_LD(uw, 16)
> +DEFINE_PCI_LD(l, 32)
> +DEFINE_PCI_LD(q, 64)
> +
> +DEFINE_PCI_ST(b, 8)
> +DEFINE_PCI_ST(w, 16)
> +DEFINE_PCI_ST(l, 32)
> +DEFINE_PCI_ST(q, 64)
> +
> diff --git a/hw/pci.h b/hw/pci.h
> index c551f96..3131016 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -172,6 +172,8 @@ struct PCIDevice {
>      char *romfile;
>      ram_addr_t rom_offset;
>      uint32_t rom_bar;
> +
> +    QLIST_HEAD(, PCIMemoryMap) memory_maps;
>  };
>  
>  PCIDevice *pci_register_device(PCIBus *bus, const char *name,
> @@ -391,4 +393,76 @@ static inline int ranges_overlap(uint64_t first1, uint64_t len1,
>      return !(last2 < first1 || last1 < first2);
>  }
>  
> +/*
> + * Memory I/O and PCI IOMMU definitions.
> + */
> +
> +#define IOMMU_PERM_READ     (1 << 0)
> +#define IOMMU_PERM_WRITE    (1 << 1)
> +#define IOMMU_PERM_RW       (IOMMU_PERM_READ | IOMMU_PERM_WRITE)
> +
> +typedef int PCIInvalidateMapFunc(void *opaque);
> +typedef int PCITranslateFunc(PCIDevice *iommu,
> +                             PCIDevice *dev,
> +                             pcibus_t addr,
> +                             target_phys_addr_t *paddr,
> +                             target_phys_addr_t *len,
> +                             unsigned perms);
> +
> +extern void pci_memory_rw(PCIDevice *dev,
> +                          pcibus_t addr,
> +                          uint8_t *buf,
> +                          pcibus_t len,
> +                          int is_write);
> +extern void *pci_memory_map(PCIDevice *dev,
> +                            PCIInvalidateMapFunc *cb,
> +                            void *opaque,
> +                            pcibus_t addr,
> +                            target_phys_addr_t *len,
> +                            int is_write);
> +extern void pci_memory_unmap(PCIDevice *dev,
> +                             void *buffer,
> +                             target_phys_addr_t len,
> +                             int is_write,
> +                             target_phys_addr_t access_len);
> +extern void pci_register_iommu(PCIDevice *dev,
> +                               PCITranslateFunc *translate);
> +extern void pci_memory_invalidate_range(PCIDevice *dev,
> +                                        pcibus_t addr,
> +                                        pcibus_t len);
> +
> +#define DECLARE_PCI_LD(suffix, size)                                    \
> +extern uint##size##_t pci_ld##suffix(PCIDevice *dev, pcibus_t addr);
> +
> +#define DECLARE_PCI_ST(suffix, size)                                    \
> +extern void pci_st##suffix(PCIDevice *dev,                              \
> +                           pcibus_t addr,                               \
> +                           uint##size##_t val);
> +
> +DECLARE_PCI_LD(ub, 8)
> +DECLARE_PCI_LD(uw, 16)
> +DECLARE_PCI_LD(l, 32)
> +DECLARE_PCI_LD(q, 64)
> +
> +DECLARE_PCI_ST(b, 8)
> +DECLARE_PCI_ST(w, 16)
> +DECLARE_PCI_ST(l, 32)
> +DECLARE_PCI_ST(q, 64)
> +
> +static inline void pci_memory_read(PCIDevice *dev,
> +                                   pcibus_t addr,
> +                                   uint8_t *buf,
> +                                   pcibus_t len)
> +{
> +    pci_memory_rw(dev, addr, buf, len, 0);
> +}
> +
> +static inline void pci_memory_write(PCIDevice *dev,
> +                                    pcibus_t addr,
> +                                    const uint8_t *buf,
> +                                    pcibus_t len)
> +{
> +    pci_memory_rw(dev, addr, (uint8_t *) buf, len, 1);
> +}
> +
>  #endif
> diff --git a/hw/pci_internals.h b/hw/pci_internals.h
> index e3c93a3..fb134b9 100644
> --- a/hw/pci_internals.h
> +++ b/hw/pci_internals.h
> @@ -33,6 +33,9 @@ struct PCIBus {
>         Keep a count of the number of devices with raised IRQs.  */
>      int nirq;
>      int *irq_count;
> +
> +    PCIDevice                       *iommu;
> +    PCITranslateFunc                *translate;
>  };

Why is translate pointer in a bus? I think it's a work of an iommu?

>  struct PCIBridge {
> @@ -44,4 +47,13 @@ struct PCIBridge {
>      const char *bus_name;
>  };
>  
> +struct PCIMemoryMap {
> +    pcibus_t                        addr;
> +    pcibus_t                        len;
> +    target_phys_addr_t              paddr;
> +    PCIInvalidateMapFunc            *invalidate;
> +    void                            *invalidate_opaque;

Can we have a structure that encapsulates the mapping
data instead of a void *?


> +    QLIST_ENTRY(PCIMemoryMap)       list;
> +};
> +
>  #endif /* QEMU_PCI_INTERNALS_H */
> diff --git a/qemu-common.h b/qemu-common.h
> index d735235..8b060e8 100644
> --- a/qemu-common.h
> +++ b/qemu-common.h
> @@ -218,6 +218,7 @@ typedef struct SMBusDevice SMBusDevice;
>  typedef struct PCIHostState PCIHostState;
>  typedef struct PCIExpressHost PCIExpressHost;
>  typedef struct PCIBus PCIBus;
> +typedef struct PCIMemoryMap PCIMemoryMap;
>  typedef struct PCIDevice PCIDevice;
>  typedef struct PCIBridge PCIBridge;
>  typedef struct SerialState SerialState;
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux