On Thu, Oct 12, 2017 at 09:20:26PM +0530, Pankaj Gupta wrote: > This patch adds virtio-pmem driver for KVM guest. > Guest reads the persistent memory range information > over virtio bus from Qemu and reserves the range > as persistent memory. Guest also allocates a block > device corresponding to the pmem range which later > can be accessed with DAX compatible file systems. > Idea is to use the virtio channel between guest and > host to perform the block device flush for guest pmem > DAX device. > > There is work to do including DAX file system support > and other advanced features. > > Signed-off-by: Pankaj Gupta <pagupta@xxxxxxxxxx> > --- > drivers/virtio/Kconfig | 10 ++ > drivers/virtio/Makefile | 1 + > drivers/virtio/virtio_pmem.c | 322 +++++++++++++++++++++++++++++++++++++++ > include/uapi/linux/virtio_pmem.h | 55 +++++++ > 4 files changed, 388 insertions(+) > create mode 100644 drivers/virtio/virtio_pmem.c > create mode 100644 include/uapi/linux/virtio_pmem.h > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig > index cff773f15b7e..0192c4bda54b 100644 > --- a/drivers/virtio/Kconfig > +++ b/drivers/virtio/Kconfig > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY > > If unsure, say Y. > > +config VIRTIO_PMEM > + tristate "Virtio pmem driver" > + depends on VIRTIO > + ---help--- > + This driver adds persistent memory range within a KVM guest. > + It also associates a block device corresponding to the pmem > + range. > + > + If unsure, say M. > + > config VIRTIO_BALLOON > tristate "Virtio balloon driver" > depends on VIRTIO > diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile > index 41e30e3dc842..032ade725cc2 100644 > --- a/drivers/virtio/Makefile > +++ b/drivers/virtio/Makefile > @@ -5,3 +5,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o > virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o > obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o > obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o > +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o > diff --git a/drivers/virtio/virtio_pmem.c b/drivers/virtio/virtio_pmem.c > new file mode 100644 > index 000000000000..74e47cae0e24 > --- /dev/null > +++ b/drivers/virtio/virtio_pmem.c > @@ -0,0 +1,322 @@ > +/* > + * virtio-pmem driver > + */ > + > +#include <linux/virtio.h> > +#include <linux/swap.h> > +#include <linux/workqueue.h> > +#include <linux/delay.h> > +#include <linux/slab.h> > +#include <linux/module.h> > +#include <linux/oom.h> > +#include <linux/wait.h> > +#include <linux/mm.h> > +#include <linux/mount.h> > +#include <linux/magic.h> > +#include <linux/virtio_pmem.h> > + > +void devm_vpmem_disable(struct device *dev, struct resource *res, void *addr) > +{ > + devm_memunmap(dev, addr); > + devm_release_mem_region(dev, res->start, resource_size(res)); > +} > + > +static void pmem_flush_done(struct virtqueue *vq) > +{ > + return; > +}; > + > +static void virtio_pmem_release_queue(void *q) > +{ > + blk_cleanup_queue(q); > +} > + > +static void virtio_pmem_freeze_queue(void *q) > +{ > + blk_freeze_queue_start(q); > +} > + > +static void virtio_pmem_release_disk(void *__pmem) > +{ > + struct virtio_pmem *pmem = __pmem; > + > + del_gendisk(pmem->disk); > + put_disk(pmem->disk); > +} > + > +static int init_vq(struct virtio_pmem *vpmem) > +{ > + struct virtqueue *vq; > + > + /* single vq */ > + vq = virtio_find_single_vq(vpmem->vdev, pmem_flush_done, "flush_queue"); > + > + if (IS_ERR(vq)) > + return PTR_ERR(vq); > + > + return 0; > +} > + > +static struct vmem_altmap *setup_pmem_pfn(struct virtio_pmem *vpmem, > + struct resource *res, struct vmem_altmap *altmap) > +{ > + u32 start_pad = 0, end_trunc = 0; > + resource_size_t start, size; > + unsigned long npfns; > + phys_addr_t offset; > + > + size = resource_size(res); > + start = PHYS_SECTION_ALIGN_DOWN(res->start); > + > + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, > + IORES_DESC_NONE) == REGION_MIXED) { > + > + start = res->start; > + start_pad = PHYS_SECTION_ALIGN_UP(start) - start; > + } > + start = res->start; > + size = PHYS_SECTION_ALIGN_UP(start + size) - start; > + > + if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, > + IORES_DESC_NONE) == REGION_MIXED) { > + > + size = resource_size(res); > + end_trunc = start + size - > + PHYS_SECTION_ALIGN_DOWN(start + size); > + } > + > + start += start_pad; > + size = resource_size(res); > + npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K) > + / PAGE_SIZE); > + > + /* > + * vmemmap_populate_hugepages() allocates the memmap array in > + * HPAGE_SIZE chunks. > + */ > + offset = ALIGN(start + SZ_8K + 64 * npfns, HPAGE_SIZE) - start; > + vpmem->data_offset = offset; > + > + struct vmem_altmap __altmap = { > + .base_pfn = init_altmap_base(start+start_pad), > + .reserve = init_altmap_reserve(start+start_pad), > + }; > + > + res->start += start_pad; > + res->end -= end_trunc; > + memcpy(altmap, &__altmap, sizeof(*altmap)); > + altmap->free = PHYS_PFN(offset - SZ_8K); > + altmap->alloc = 0; The __altmap part can be rewritten using compound literal syntax: *altmap = (struct vmem_altmap) { .base_pfn = init_altmap_base(start+start_pad), .reserve = init_altmap_reserve(start+start_pad), .free = PHYS_PFN(offset - SZ_8K), }; This eliminates the temporary variable, memcpy, and explicit alloc = 0 initialization. > + > + return altmap; > +} > + > +static blk_status_t pmem_do_bvec(struct virtio_pmem *pmem, struct page *page, > + unsigned int len, unsigned int off, bool is_write, > + sector_t sector) > +{ > + blk_status_t rc = BLK_STS_OK; > + phys_addr_t pmem_off = sector * 512 + pmem->data_offset; > + void *pmem_addr = pmem->virt_addr + pmem_off; > + > + if (!is_write) { > + rc = read_pmem(page, off, pmem_addr, len); > + flush_dcache_page(page); > + } else { > + flush_dcache_page(page); What are the semantics of this device? Why flush the dcache here if an explicit flush command needs to be sent over the virtqueue? > + write_pmem(pmem_addr, page, off, len); > + } > + > + return rc; > +} > + > +static int vpmem_rw_page(struct block_device *bdev, sector_t sector, > + struct page *page, bool is_write) > +{ > + struct virtio_pmem *pmem = bdev->bd_queue->queuedata; > + blk_status_t rc; > + > + rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, > + 0, is_write, sector); > + > + if (rc == 0) > + page_endio(page, is_write, 0); > + > + return blk_status_to_errno(rc); > +} > + > +#ifndef REQ_FLUSH > +#define REQ_FLUSH REQ_PREFLUSH > +#endif Is this out-of-tree kernel module compatibility stuff that can be removed? > + > +static blk_qc_t virtio_pmem_make_request(struct request_queue *q, > + struct bio *bio) > +{ > + blk_status_t rc = 0; > + struct bio_vec bvec; > + struct bvec_iter iter; > + struct virtio_pmem *pmem = q->queuedata; > + > + if (bio->bi_opf & REQ_FLUSH) > + //todo host flush command This detail is critical to the device design. What is the plan? > + > + bio_for_each_segment(bvec, bio, iter) { > + rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, > + bvec.bv_offset, op_is_write(bio_op(bio)), > + iter.bi_sector); > + if (rc) { > + bio->bi_status = rc; > + break; > + } > + } > + > + bio_endio(bio); > + return BLK_QC_T_NONE; > +} > + > +static const struct block_device_operations pmem_fops = { > + .owner = THIS_MODULE, > + .rw_page = vpmem_rw_page, > + //.revalidate_disk = nvdimm_revalidate_disk, > +}; > + > +static int virtio_pmem_probe(struct virtio_device *vdev) > +{ > + struct virtio_pmem *vpmem; > + int err = 0; > + void *addr; > + struct resource *res, res_pfn; > + struct request_queue *q; > + struct vmem_altmap __altmap, *altmap = NULL; > + struct gendisk *disk; > + struct device *gendev; > + int nid = dev_to_node(&vdev->dev); > + > + if (!vdev->config->get) { > + dev_err(&vdev->dev, "%s failure: config disabled\n", > + __func__); > + return -EINVAL; > + } > + > + vdev->priv = vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), > + GFP_KERNEL); > + > + if (!vpmem) { > + err = -ENOMEM; > + goto out; > + } > + > + dev_set_drvdata(&vdev->dev, vpmem); > + > + vpmem->vdev = vdev; > + err = init_vq(vpmem); > + if (err) > + goto out; > + > + if (!virtio_has_feature(vdev, VIRTIO_PMEM_PLUG)) { > + dev_err(&vdev->dev, "%s : pmem not supported\n", > + __func__); > + goto out; > + } Why is VIRTIO_PMEM_PLUG optional for this new device type if it's already required by the first version of the driver? err is not set when the error path is taken. > + > + virtio_cread(vpmem->vdev, struct virtio_pmem_config, > + start, &vpmem->start); > + virtio_cread(vpmem->vdev, struct virtio_pmem_config, > + size, &vpmem->size); The struct resource start and size fields can vary between architectures. Virtio device configuration space layout must not vary between architectures. Please use u64. > + > + res_pfn.start = vpmem->start; > + res_pfn.end = vpmem->start + vpmem->size-1; > + > + /* used for allocating memmap in the pmem device */ > + altmap = setup_pmem_pfn(vpmem, &res_pfn, &__altmap); > + > + res = devm_request_mem_region(&vdev->dev, > + res_pfn.start, resource_size(&res_pfn), "virtio-pmem"); > + > + if (!res) { > + dev_warn(&vdev->dev, "could not reserve region "); > + return -EBUSY; > + } > + > + q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(&vdev->dev)); > + > + if (!q) > + return -ENOMEM; > + > + if (devm_add_action_or_reset(&vdev->dev, > + virtio_pmem_release_queue, q)) > + return -ENOMEM; > + > + vpmem->pfn_flags = PFN_DEV; > + > + /* allocate memap in pmem device itself */ > + if (IS_ENABLED(CONFIG_ZONE_DEVICE)) { > + > + addr = devm_memremap_pages(&vdev->dev, res, > + &q->q_usage_counter, altmap); > + vpmem->pfn_flags |= PFN_MAP; > + } else > + addr = devm_memremap(&vdev->dev, vpmem->start, > + vpmem->size, ARCH_MEMREMAP_PMEM); > + > + /* > + * At release time the queue must be frozen before > + * devm_memremap_pages is unwound > + */ > + if (devm_add_action_or_reset(&vdev->dev, > + virtio_pmem_freeze_queue, q)) > + return -ENOMEM; > + > + if (IS_ERR(addr)) > + return PTR_ERR(addr); > + > + vpmem->virt_addr = addr; > + blk_queue_write_cache(q, 0, 0); > + blk_queue_make_request(q, virtio_pmem_make_request); > + blk_queue_physical_block_size(q, PAGE_SIZE); > + blk_queue_logical_block_size(q, 512); > + blk_queue_max_hw_sectors(q, UINT_MAX); > + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); > + queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); > + q->queuedata = vpmem; > + > + disk = alloc_disk_node(0, nid); > + if (!disk) > + return -ENOMEM; The return statements in this function look suspicious. There probably needs to be a cleanup to avoid memory leaks. > + vpmem->disk = disk; > + > + disk->fops = &pmem_fops; > + disk->queue = q; > + disk->flags = GENHD_FL_EXT_DEVT; > + strcpy(disk->disk_name, "vpmem"); > + set_capacity(disk, vpmem->size/512); > + gendev = disk_to_dev(disk); > + > + virtio_device_ready(vdev); > + device_add_disk(&vdev->dev, disk); > + > + if (devm_add_action_or_reset(&vdev->dev, > + virtio_pmem_release_disk, vpmem)) > + return -ENOMEM; > + > + revalidate_disk(disk); > + return 0; > +out: > + vdev->config->del_vqs(vdev); > + return err; > +} > + > +static struct virtio_driver virtio_pmem_driver = { > + .feature_table = features, > + .feature_table_size = ARRAY_SIZE(features), > + .driver.name = KBUILD_MODNAME, > + .driver.owner = THIS_MODULE, > + .id_table = id_table, > + .probe = virtio_pmem_probe, > + //.remove = virtio_pmem_remove, > +}; > + > +module_virtio_driver(virtio_pmem_driver); > +MODULE_DEVICE_TABLE(virtio, id_table); > +MODULE_DESCRIPTION("Virtio pmem driver"); > +MODULE_LICENSE("GPL"); > diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h > new file mode 100644 > index 000000000000..ec0c728c79ba > --- /dev/null > +++ b/include/uapi/linux/virtio_pmem.h > @@ -0,0 +1,55 @@ > +/* > + * Virtio pmem Device > + * > + * > + */ > + > +#ifndef _LINUX_VIRTIO_PMEM_H > +#define _LINUX_VIRTIO_PMEM_H > + > +#include <linux/types.h> > +#include <linux/virtio_types.h> > +#include <linux/virtio_ids.h> > +#include <linux/virtio_config.h> > +#include <linux/pfn_t.h> > +#include <linux/fs.h> > +#include <linux/blk-mq.h> > +#include <linux/pmem_common.h> include/uapi/ headers must compile when included from userspace applications. The header should define userspace APIs only. If you want to define kernel-internal APIs, please add them to include/linux/ or similar. This also means that include/uapi/ headers cannot include include/linux/pmem_common.h or other kernel headers outside #ifdef __KERNEL__. This header file should only contain struct virtio_pmem_config, VIRTIO_PMEM_PLUG, and other virtio device definitions. > + > +bool pmem_should_map_pages(struct device *dev); > + > +#define VIRTIO_PMEM_PLUG 0 > + > +struct virtio_pmem_config { > + > +uint64_t start; > +uint64_t size; > +uint64_t align; > +uint64_t data_offset; > +}; > + > +struct virtio_pmem { > + > + struct virtio_device *vdev; > + struct virtqueue *req_vq; > + > + uint64_t start; > + uint64_t size; > + uint64_t align; > + uint64_t data_offset; > + u64 pfn_flags; > + void *virt_addr; > + struct gendisk *disk; > +} __packed; > + > +static struct virtio_device_id id_table[] = { > + { VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID }, > + { 0 }, > +}; > + > +static unsigned int features[] = { > + VIRTIO_PMEM_PLUG, > +}; > + > +#endif > + > -- > 2.13.0 >