On Fri, Jan 17, 2020 at 8:52 AM Jason Wang <jasowang@xxxxxxxxxx> wrote:
On 2020/1/17 下午8:13, Michael S. Tsirkin wrote:
> On Thu, Jan 16, 2020 at 08:42:29PM +0800, Jason Wang wrote:
>> vDPA device is a device that uses a datapath which complies with the
>> virtio specifications with vendor specific control path. vDPA devices
>> can be both physically located on the hardware or emulated by
>> software. vDPA hardware devices are usually implemented through PCIE
>> with the following types:
>>
>> - PF (Physical Function) - A single Physical Function
>> - VF (Virtual Function) - Device that supports single root I/O
>> virtualization (SR-IOV). Its Virtual Function (VF) represents a
>> virtualized instance of the device that can be assigned to different
>> partitions
>> - VDEV (Virtual Device) - With technologies such as Intel Scalable
>> IOV, a virtual device composed by host OS utilizing one or more
>> ADIs.
>> - SF (Sub function) - Vendor specific interface to slice the Physical
>> Function to multiple sub functions that can be assigned to different
>> partitions as virtual devices.
>>
>> From a driver's perspective, depending on how and where the DMA
>> translation is done, vDPA devices are split into two types:
>>
>> - Platform specific DMA translation - From the driver's perspective,
>> the device can be used on a platform where device access to data in
>> memory is limited and/or translated. An example is a PCIE vDPA whose
>> DMA request was tagged via a bus (e.g PCIE) specific way. DMA
>> translation and protection are done at PCIE bus IOMMU level.
>> - Device specific DMA translation - The device implements DMA
>> isolation and protection through its own logic. An example is a vDPA
>> device which uses on-chip IOMMU.
>>
>> To hide the differences and complexity of the above types for a vDPA
>> device/IOMMU options and in order to present a generic virtio device
>> to the upper layer, a device agnostic framework is required.
>>
>> This patch introduces a software vDPA bus which abstracts the
>> common attributes of vDPA device, vDPA bus driver and the
>> communication method (vdpa_config_ops) between the vDPA device
>> abstraction and the vDPA bus driver:
>>
>> With the abstraction of vDPA bus and vDPA bus operations, the
>> difference and complexity of the under layer hardware is hidden from
>> upper layer. The vDPA bus drivers on top can use a unified
>> vdpa_config_ops to control different types of vDPA device.
>>
>> Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
>> ---
>> MAINTAINERS | 1 +
>> drivers/virtio/Kconfig | 2 +
>> drivers/virtio/Makefile | 1 +
>> drivers/virtio/vdpa/Kconfig | 9 ++
>> drivers/virtio/vdpa/Makefile | 2 +
>> drivers/virtio/vdpa/vdpa.c | 141 ++++++++++++++++++++++++++
>> include/linux/vdpa.h | 191 +++++++++++++++++++++++++++++++++++
>> 7 files changed, 347 insertions(+)
>> create mode 100644 drivers/virtio/vdpa/Kconfig
>> create mode 100644 drivers/virtio/vdpa/Makefile
>> create mode 100644 drivers/virtio/vdpa/vdpa.c
>> create mode 100644 include/linux/vdpa.h
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index d4bda9c900fa..578d2a581e3b 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -17540,6 +17540,7 @@ F: tools/virtio/
>> F: drivers/net/virtio_net.c
>> F: drivers/block/virtio_blk.c
>> F: include/linux/virtio*.h
>> +F: include/linux/vdpa.h
>> F: include/uapi/linux/virtio_*.h
>> F: drivers/crypto/virtio/
>> F: mm/balloon_compaction.c
>> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
>> index 078615cf2afc..9c4fdb64d9ac 100644
>> --- a/drivers/virtio/Kconfig
>> +++ b/drivers/virtio/Kconfig
>> @@ -96,3 +96,5 @@ config VIRTIO_MMIO_CMDLINE_DEVICES
>> If unsure, say 'N'.
>>
>> endif # VIRTIO_MENU
>> +
>> +source "drivers/virtio/vdpa/Kconfig"
>> diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
>> index 3a2b5c5dcf46..fdf5eacd0d0a 100644
>> --- a/drivers/virtio/Makefile
>> +++ b/drivers/virtio/Makefile
>> @@ -6,3 +6,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
>> virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
>> obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
>> obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
>> +obj-$(CONFIG_VDPA) += vdpa/
>> diff --git a/drivers/virtio/vdpa/Kconfig b/drivers/virtio/vdpa/Kconfig
>> new file mode 100644
>> index 000000000000..3032727b4d98
>> --- /dev/null
>> +++ b/drivers/virtio/vdpa/Kconfig
>> @@ -0,0 +1,9 @@
>> +# SPDX-License-Identifier: GPL-2.0-only
>> +config VDPA
>> + tristate
>> + default n
>> + help
>> + Enable this module to support vDPA device that uses a
>> + datapath which complies with virtio specifications with
>> + vendor specific control path.
>> +
>> diff --git a/drivers/virtio/vdpa/Makefile b/drivers/virtio/vdpa/Makefile
>> new file mode 100644
>> index 000000000000..ee6a35e8a4fb
>> --- /dev/null
>> +++ b/drivers/virtio/vdpa/Makefile
>> @@ -0,0 +1,2 @@
>> +# SPDX-License-Identifier: GPL-2.0
>> +obj-$(CONFIG_VDPA) += vdpa.o
>> diff --git a/drivers/virtio/vdpa/vdpa.c b/drivers/virtio/vdpa/vdpa.c
>> new file mode 100644
>> index 000000000000..2b0e4a9f105d
>> --- /dev/null
>> +++ b/drivers/virtio/vdpa/vdpa.c
>> @@ -0,0 +1,141 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * vDPA bus.
>> + *
>> + * Copyright (c) 2019, Red Hat. All rights reserved.
>> + * Author: Jason Wang <jasowang@xxxxxxxxxx>
>> + *
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/idr.h>
>> +#include <linux/vdpa.h>
>> +
>> +#define MOD_VERSION "0.1"
>> +#define MOD_DESC "vDPA bus"
>> +#define MOD_AUTHOR "Jason Wang <jasowang@xxxxxxxxxx>"
>> +#define MOD_LICENSE "GPL v2"
>> +
>> +static DEFINE_IDA(vdpa_index_ida);
>> +
>> +struct device *vdpa_get_parent(struct vdpa_device *vdpa)
>> +{
>> + return vdpa->dev.parent;
>> +}
>> +EXPORT_SYMBOL(vdpa_get_parent);
>> +
>> +void vdpa_set_parent(struct vdpa_device *vdpa, struct device *parent)
>> +{
>> + vdpa->dev.parent = parent;
>> +}
>> +EXPORT_SYMBOL(vdpa_set_parent);
>> +
>> +struct vdpa_device *dev_to_vdpa(struct device *_dev)
>> +{
>> + return container_of(_dev, struct vdpa_device, dev);
>> +}
>> +EXPORT_SYMBOL_GPL(dev_to_vdpa);
>> +
>> +struct device *vdpa_to_dev(struct vdpa_device *vdpa)
>> +{
>> + return &vdpa->dev;
>> +}
>> +EXPORT_SYMBOL_GPL(vdpa_to_dev);
>> +
>> +static int vdpa_dev_probe(struct device *d)
>> +{
>> + struct vdpa_device *dev = dev_to_vdpa(d);
>> + struct vdpa_driver *drv = drv_to_vdpa(dev->dev.driver);
>> + int ret = 0;
>> +
>> + if (drv && drv->probe)
>> + ret = drv->probe(d);
>> +
>> + return ret;
>> +}
>> +
>> +static int vdpa_dev_remove(struct device *d)
>> +{
>> + struct vdpa_device *dev = dev_to_vdpa(d);
>> + struct vdpa_driver *drv = drv_to_vdpa(dev->dev.driver);
>> +
>> + if (drv && drv->remove)
>> + drv->remove(d);
>> +
>> + return 0;
>> +}
>> +
>> +static struct bus_type vdpa_bus = {
>> + .name = "vdpa",
>> + .probe = vdpa_dev_probe,
>> + .remove = vdpa_dev_remove,
>> +};
>> +
>> +int register_vdpa_device(struct vdpa_device *vdpa)
>> +{
>> + int err;
>> +
>> + if (!vdpa_get_parent(vdpa))
>> + return -EINVAL;
>> +
>> + if (!vdpa->config)
>> + return -EINVAL;
>> +
>> + err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
>> + if (err < 0)
>> + return -EFAULT;
>> +
>> + vdpa->dev.bus = &vdpa_bus;
>> + device_initialize(&vdpa->dev);
>> +
>> + vdpa->index = err;
>> + dev_set_name(&vdpa->dev, "vdpa%u", vdpa->index);
>> +
>> + err = device_add(&vdpa->dev);
>> + if (err)
>> + ida_simple_remove(&vdpa_index_ida, vdpa->index);
>> +
>> + return err;
>> +}
>> +EXPORT_SYMBOL_GPL(register_vdpa_device);
>> +
>> +void unregister_vdpa_device(struct vdpa_device *vdpa)
>> +{
>> + int index = vdpa->index;
>> +
>> + device_unregister(&vdpa->dev);
>> + ida_simple_remove(&vdpa_index_ida, index);
>> +}
>> +EXPORT_SYMBOL_GPL(unregister_vdpa_device);
>> +
>> +int register_vdpa_driver(struct vdpa_driver *driver)
>> +{
>> + driver->drv.bus = &vdpa_bus;
>> + return driver_register(&driver->drv);
>> +}
>> +EXPORT_SYMBOL_GPL(register_vdpa_driver);
>> +
>> +void unregister_vdpa_driver(struct vdpa_driver *driver)
>> +{
>> + driver_unregister(&driver->drv);
>> +}
>> +EXPORT_SYMBOL_GPL(unregister_vdpa_driver);
>> +
>> +static int vdpa_init(void)
>> +{
>> + if (bus_register(&vdpa_bus) != 0)
>> + panic("virtio bus registration failed");
>> + return 0;
>> +}
>> +
>> +static void __exit vdpa_exit(void)
>> +{
>> + bus_unregister(&vdpa_bus);
>> + ida_destroy(&vdpa_index_ida);
>> +}
>> +core_initcall(vdpa_init);
>> +module_exit(vdpa_exit);
>> +
>> +MODULE_VERSION(MOD_VERSION);
>> +MODULE_AUTHOR(MOD_AUTHOR);
>> +MODULE_LICENSE(MOD_LICENSE);
>> diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
>> new file mode 100644
>> index 000000000000..47760137ef66
>> --- /dev/null
>> +++ b/include/linux/vdpa.h
>> @@ -0,0 +1,191 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#ifndef _LINUX_VDPA_H
>> +#define _LINUX_VDPA_H
>> +
>> +#include <linux/device.h>
>> +#include <linux/interrupt.h>
>> +#include <linux/vhost_iotlb.h>
>> +
>> +/**
>> + * vDPA callback definition.
>> + * @callback: interrupt callback function
>> + * @private: the data passed to the callback function
>> + */
>> +struct vdpa_callback {
>> + irqreturn_t (*callback)(void *data);
>> + void *private;
>> +};
>> +
>> +/**
>> + * vDPA device - representation of a vDPA device
>> + * @dev: underlying device
>> + * @config: the configuration ops for this device.
>> + * @index: device index
>> + */
>> +struct vdpa_device {
>> + struct device dev;
>> + const struct vdpa_config_ops *config;
>> + int index;
>> +};
>> +
>> +/**
>> + * vDPA_config_ops - operations for configuring a vDPA device.
>> + * Note: vDPA device drivers are required to implement all of the
>> + * operations unless it is optional mentioned in the following list.
>> + * @set_vq_address: Set the address of virtqueue
>> + * @vdev: vdpa device
>> + * @idx: virtqueue index
>> + * @desc_area: address of desc area
>> + * @driver_area: address of driver area
>> + * @device_area: address of device area
>> + * Returns integer: success (0) or error (< 0)
>> + * @set_vq_num: Set the size of virtqueue
>> + * @vdev: vdpa device
>> + * @idx: virtqueue index
>> + * @num: the size of virtqueue
>> + * @kick_vq: Kick the virtqueue
>> + * @vdev: vdpa device
>> + * @idx: virtqueue index
>
> This seems wrong: kicks are data path so drivers should not
> do it in a vendor specific way.
I'm not sure I get this since the doorbell is pretty vendor specific.
The idea here is to start from the simple and common cases that can work
for both kernel virtio drivers and vhost:
- For kernel, kick_vq() is called from vq->notify() directly
- For vhost, vhost is in charge of hooking the eventfd to kick_vq()
> How about an API
> returning the device/resource that can then be
> mapped as appropriate?
>
Yes, this could be a further optimization on top, but it is not a must (it
only works when, e.g., the doorbell does not share MMIO space with other
functions). For vhost we need something like this, and we need to hook it
to mmap() of the vhost file descriptor.
>> + * @set_vq_cb: Set the interrupt callback function for
>> + * a virtqueue
>> + * @vdev: vdpa device
>> + * @idx: virtqueue index
>> + * @cb: virtio-vdev interrupt callback structure
>
> Calls are data path too, I think we need some way to map MSI?
Similarly, this could be an optimization on top, and we can start from
the simple and common cases:
- For kernel, the vq callback could be mapped to the MSI interrupt handler
directly
- For vhost, the eventfd wakeup could be hooked into the cb here
>
>> + * @set_vq_ready: Set ready status for a virtqueue
>> + * @vdev: vdpa device
>> + * @idx: virtqueue index
>> + * @ready: ready (true) not ready(false)
>> + * @get_vq_ready: Get ready status for a virtqueue
>> + * @vdev: vdpa device
>> + * @idx: virtqueue index
>> + * Returns boolean: ready (true) or not (false)
>> + * @set_vq_state: Set the state for a virtqueue
>> + * @vdev: vdpa device
>> + * @idx: virtqueue index
>> + * @state: virtqueue state (last_avail_idx)
>> + * Returns integer: success (0) or error (< 0)
>> + * @get_vq_state: Get the state for a virtqueue
>> + * @vdev: vdpa device
>> + * @idx: virtqueue index
>> + * Returns virtqueue state (last_avail_idx)
>> + * @get_vq_align: Get the virtqueue align requirement
>> + * for the device
>> + * @vdev: vdpa device
>> + * Returns virtqueue algin requirement
>
> Where does this come from? Spec dictates that for a data path,
> vendor specific values for this will break userspace ...
It comes from the align parameter of vring_create_virtqueue(). We can
expose the alignment to userspace if necessary. If it's not necessary, I
can drop this method here.
>
>> + * @get_features: Get virtio features supported by the device
>> + * @vdev: vdpa device
>> + * Returns the virtio features support by the
>> + * device
>> + * @set_features: Set virtio features supported by the driver
>> + * @vdev: vdpa device
>> + * @features: feature support by the driver
>> + * Returns integer: success (0) or error (< 0)
>> + * @set_config_cb: Set the config interrupt callback
>> + * @vdev: vdpa device
>> + * @cb: virtio-vdev interrupt callback structure
>> + * @get_vq_num_max: Get the max size of virtqueue
>> + * @vdev: vdpa device
>> + * Returns u16: max size of virtqueue
>
> I'm not sure this has to be uniform across VQs.
Let me add an index parameter to this.
>
>> + * @get_device_id: Get virtio device id
>> + * @vdev: vdpa device
>> + * Returns u32: virtio device id
>
> is this the virtio ID? PCI ID?
Virtio ID
>
>> + * @get_vendor_id: Get id for the vendor that provides this device
>> + * @vdev: vdpa device
>> + * Returns u32: virtio vendor id
> what's the idea behind this? userspace normally doesn't interact with
> this ... debugging?
This allows vendor-specific drivers on top of the vDPA bus. If there is
no interest in this, I can drop it.
[RJM] Wouldn't usage of get_device_id and get_vendor_id, beyond reporting,
tend to lead to vendor-specific code in the framework, instead of keeping
the framework agnostic and leaving the vendor-specific stuff to the vendor's vDPA device driver?
>
>> + * @get_status: Get the device status
>> + * @vdev: vdpa device
>> + * Returns u8: virtio device status
>> + * @set_status: Set the device status
>> + * @vdev: vdpa device
>> + * @status: virtio device status
>> + * @get_config: Read from device specific configuration space
>> + * @vdev: vdpa device
>> + * @offset: offset from the beginning of
>> + * configuration space
>> + * @buf: buffer used to read to
>> + * @len: the length to read from
>> + * configuration space
>> + * @set_config: Write to device specific configuration space
>> + * @vdev: vdpa device
>> + * @offset: offset from the beginning of
>> + * configuration space
>> + * @buf: buffer used to write from
>> + * @len: the length to write to
>> + * configuration space
>> + * @get_generation: Get device config generation (optional)
>> + * @vdev: vdpa device
>> + * Returns u32: device generation
>> + * @set_map: Set device memory mapping, optional
>> + * and only needed for device that using
>> + * device specific DMA translation
>> + * (on-chip IOMMU)
>> + * @vdev: vdpa device
>> + * @iotlb: vhost memory mapping to be
>> + * used by the vDPA
>> + * Returns integer: success (0) or error (< 0)
> OK so any change just swaps in a completely new mapping?
> Wouldn't this make minor changes such as memory hotplug
> quite expensive?
My understanding is that incremental updating of the on-chip IOMMU
may degrade the performance, so vendor vDPA drivers may want to know
all the mappings at once. Technically, we can keep the incremental API
here and let the vendor vDPA drivers record the full mapping
internally, which may slightly increase the complexity of the vendor driver.
We need more input from vendors here.
Thanks
>
>> + */
>> +struct vdpa_config_ops {
>> + /* Virtqueue ops */
>> + int (*set_vq_address)(struct vdpa_device *vdev,
>> + u16 idx, u64 desc_area, u64 driver_area,
>> + u64 device_area);
>> + void (*set_vq_num)(struct vdpa_device *vdev, u16 idx, u32 num);
>> + void (*kick_vq)(struct vdpa_device *vdev, u16 idx);
>> + void (*set_vq_cb)(struct vdpa_device *vdev, u16 idx,
>> + struct vdpa_callback *cb);
>> + void (*set_vq_ready)(struct vdpa_device *vdev, u16 idx, bool ready);
>> + bool (*get_vq_ready)(struct vdpa_device *vdev, u16 idx);
>> + int (*set_vq_state)(struct vdpa_device *vdev, u16 idx, u64 state);
>> + u64 (*get_vq_state)(struct vdpa_device *vdev, u16 idx);
>> +
>> + /* Device ops */
>> + u16 (*get_vq_align)(struct vdpa_device *vdev);
>> + u64 (*get_features)(struct vdpa_device *vdev);
>> + int (*set_features)(struct vdpa_device *vdev, u64 features);
>> + void (*set_config_cb)(struct vdpa_device *vdev,
>> + struct vdpa_callback *cb);
>> + u16 (*get_vq_num_max)(struct vdpa_device *vdev);
>> + u32 (*get_device_id)(struct vdpa_device *vdev);
>> + u32 (*get_vendor_id)(struct vdpa_device *vdev);
>> + u8 (*get_status)(struct vdpa_device *vdev);
>> + void (*set_status)(struct vdpa_device *vdev, u8 status);
>> + void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
>> + void *buf, unsigned int len);
>> + void (*set_config)(struct vdpa_device *vdev, unsigned int offset,
>> + const void *buf, unsigned int len);
>> + u32 (*get_generation)(struct vdpa_device *vdev);
>> +
>> + /* Mem table */
>> + int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb);
>> +};
>> +
>> +int register_vdpa_device(struct vdpa_device *vdpa);
>> +void unregister_vdpa_device(struct vdpa_device *vdpa);
>> +
>> +struct device *vdpa_get_parent(struct vdpa_device *vdpa);
>> +void vdpa_set_parent(struct vdpa_device *vdpa, struct device *parent);
>> +
>> +struct vdpa_device *dev_to_vdpa(struct device *_dev);
>> +struct device *vdpa_to_dev(struct vdpa_device *vdpa);
>> +
>> +/**
>> + * vdpa_driver - operations for a vDPA driver
>> + * @driver: underlying device driver
>> + * @probe: the function to call when a device is found. Returns 0 or -errno.
>> + * @remove: the function to call when a device is removed.
>> + */
>> +struct vdpa_driver {
>> + struct device_driver drv;
>> + int (*probe)(struct device *dev);
>> + void (*remove)(struct device *dev);
>> +};
>> +
>> +int register_vdpa_driver(struct vdpa_driver *drv);
>> +void unregister_vdpa_driver(struct vdpa_driver *drv);
>> +
>> +static inline struct vdpa_driver *drv_to_vdpa(struct device_driver *drv)
>> +{
>> + return container_of(drv, struct vdpa_driver, drv);
>> +}
>> +
>> +#endif /* _LINUX_VDPA_H */
>> --
>> 2.19.1
_______________________________________________ Virtualization mailing list Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/virtualization