[PATCH 2/3] iopmem : Add a block device driver for PCIe attached IO memory.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Add a new block device driver that binds to PCIe devices and turns
PCIe BARs into DAX capable block devices.

Signed-off-by: Stephen Bates <sbates@xxxxxxxxxxxx>
Signed-off-by: Logan Gunthorpe <logang@xxxxxxxxxxxx>
---
 MAINTAINERS            |   7 ++
 drivers/block/Kconfig  |  27 ++++
 drivers/block/Makefile |   1 +
 drivers/block/iopmem.c | 333 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 368 insertions(+)
 create mode 100644 drivers/block/iopmem.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 1cd38a7..c379f9d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6510,6 +6510,13 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/iommu/
 F:	drivers/iommu/

+IOPMEM BLOCK DEVICE DRIVER
+M:	Stephen Bates <sbates@xxxxxxxxxxxx>
+L:	linux-block@xxxxxxxxxxxxxxx
+S:	Maintained
+F:	drivers/block/iopmem.c
+F:	Documentation/blockdev/iopmem.txt
+
 IP MASQUERADING
 M:	Juanjo Ciarlante <jjciarla@xxxxxxxxxxxxxxxx>
 S:	Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 39dd30b..13ae1e7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -537,4 +537,31 @@ config BLK_DEV_RSXX
 	  To compile this driver as a module, choose M here: the
 	  module will be called rsxx.

+config BLK_DEV_IOPMEM
+	tristate "Persistent block device backed by PCIe Memory"
+	depends on ZONE_DEVICE
+	default n
+	help
+	  Say Y here if you want to include a generic device driver
+	  that can create a block device from persistent PCIe attached
+	  IO memory.
+
+	  To compile this driver as a module, choose M here: The
+	  module will be called iopmem. A block device will be created
+	  for each PCIe attached device that matches the vendor and
+	  device ID as specified in the module. Alternatively, this
+	  driver can be bound to any arbitrary PCIe function using the
+	  sysfs bind entry.
+
+	  This block device supports direct access (DAX) file systems
+	  and supports struct page backing for the IO Memory. This
+	  makes the underlying memory suitable for things like RDMA
+	  Memory Regions and Direct IO which is useful for PCIe
+	  peer-to-peer DMA operations.
+
+	  Note that persistence is only assured if the memory on the
+	  PCIe card has some form of power loss protection. This could
+	  be provided via some form of battery, a supercap/NAND combo
+	  or some exciting new persistent memory technology.
+
 endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1e9661e..1f4f69b 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
 obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk.o
 obj-$(CONFIG_ZRAM) += zram/
+obj-$(CONFIG_BLK_DEV_IOPMEM)	+= iopmem.o

 skd-y		:= skd_main.o
 swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/iopmem.c b/drivers/block/iopmem.c
new file mode 100644
index 0000000..4a1e693
--- /dev/null
+++ b/drivers/block/iopmem.c
@@ -0,0 +1,333 @@
+/*
+ * IOPMEM Block Device Driver
+ * Copyright (c) 2016, Microsemi Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * This driver is heavily based on drivers/block/pmem.c.
+ * Copyright (c) 2014, Intel Corporation.
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
+#include <linux/memremap.h>
+
+/* Index of the PCI BAR that the driver claims, maps and exposes as a disk. */
+static const int BAR_ID = 4;
+
+/*
+ * PCI vendor/device IDs matched by this driver. The table is only ever
+ * read, so it is declared const.
+ */
+static const struct pci_device_id iopmem_id_table[] = {
+	{ PCI_DEVICE(0x11f8, 0xf115) },
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, iopmem_id_table);
+
+/*
+ * Per-device driver state; one instance is allocated for every PCI function
+ * the driver is probed against.
+ */
+struct iopmem_device {
+	struct request_queue *queue;	/* queue handed to the gendisk */
+	struct gendisk *disk;		/* disk registered for this device */
+	struct device *dev;		/* the PCI function's struct device */
+
+	int instance;			/* number used in the disk name */
+
+	/* One contiguous memory region per device */
+	phys_addr_t phys_addr;		/* physical start of the region */
+	void *virt_addr;		/* kernel mapping of the region */
+	size_t size;			/* length of the region in bytes */
+};
+
+/*
+ * We can only access the iopmem device with full 32-bit word
+ * accesses, which cannot be guaranteed by the regular memcpy.
+ */
+
+/*
+ * Copy @sz bytes from @src to @dst, moving every whole word with a u64-sized
+ * load. A trailing partial word is still fetched from @src with one u64
+ * load; only the remaining @sz % sizeof(u64) bytes of it are stored to @dst.
+ */
+static void memcpy_from_iopmem(void *dst, const void *src, size_t sz)
+{
+	const u64 *in = src;
+	u64 *out = dst;
+	size_t nwords = sz / sizeof(u64);
+	size_t tail = sz % sizeof(u64);
+	size_t i;
+
+	for (i = 0; i < nwords; i++)
+		out[i] = in[i];
+
+	if (tail) {
+		/* Read a whole word, then keep only the bytes asked for. */
+		u64 last = in[nwords];
+
+		memcpy(&out[nwords], &last, tail);
+	}
+}
+
+/*
+ * Copy @len bytes, starting @off bytes into @page, to @iopmem_addr. The page
+ * is mapped with kmap_atomic() only for the duration of the copy.
+ */
+static void write_iopmem(void *iopmem_addr, struct page *page,
+		       unsigned int off, unsigned int len)
+{
+	void *kaddr = kmap_atomic(page);
+	const void *src = kaddr + off;
+
+	memcpy(iopmem_addr, src, len);
+	kunmap_atomic(kaddr);
+}
+
+/*
+ * Copy @len bytes from @iopmem_addr into @page at byte offset @off, using
+ * memcpy_from_iopmem() for the transfer. The page is mapped with
+ * kmap_atomic() only for the duration of the copy.
+ */
+static void read_iopmem(struct page *page, unsigned int off,
+			void *iopmem_addr, unsigned int len)
+{
+	void *kaddr = kmap_atomic(page);
+	void *dst = kaddr + off;
+
+	memcpy_from_iopmem(dst, iopmem_addr, len);
+	kunmap_atomic(kaddr);
+}
+
+/*
+ * Transfer @len bytes between @page (at byte offset @off) and the device
+ * memory that backs @sector. Sectors are 512 bytes. The page's dcache is
+ * flushed before it is read for a write and after it is filled by a read.
+ */
+static void iopmem_do_bvec(struct iopmem_device *iopmem, struct page *page,
+			   unsigned int len, unsigned int off, bool is_write,
+			   sector_t sector)
+{
+	phys_addr_t byte_off = sector * 512;
+	void *target = iopmem->virt_addr + byte_off;
+
+	if (is_write) {
+		flush_dcache_page(page);
+		write_iopmem(target, page, off, len);
+		return;
+	}
+
+	read_iopmem(page, off, target, len);
+	flush_dcache_page(page);
+}
+
+/*
+ * make_request handler: service every segment of @bio synchronously through
+ * iopmem_do_bvec(), then complete the bio. Always returns BLK_QC_T_NONE.
+ */
+static blk_qc_t iopmem_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct iopmem_device *iopmem = q->queuedata;
+	/* The direction is a property of the bio, not of each segment. */
+	bool is_write = op_is_write(bio_op(bio));
+	struct bvec_iter iter;
+	struct bio_vec bvec;
+
+	bio_for_each_segment(bvec, bio, iter)
+		iopmem_do_bvec(iopmem, bvec.bv_page, bvec.bv_len,
+			       bvec.bv_offset, is_write, iter.bi_sector);
+
+	bio_endio(bio);
+	return BLK_QC_T_NONE;
+}
+
+/*
+ * rw_page handler: transfer one full page at @sector in the direction given
+ * by @is_write, then signal completion with page_endio(). Always returns 0.
+ */
+static int iopmem_rw_page(struct block_device *bdev, sector_t sector,
+		       struct page *page, bool is_write)
+{
+	struct request_queue *q = bdev->bd_queue;
+
+	iopmem_do_bvec(q->queuedata, page, PAGE_SIZE, 0, is_write, sector);
+	page_endio(page, is_write, 0);
+
+	return 0;
+}
+
+/*
+ * direct_access handler: report the kernel address (@kaddr) and pfn (@pfn)
+ * of the device memory backing @sector.
+ *
+ * Returns the number of bytes from that offset to the end of the region, or
+ * -ENODEV if the queue has no driver data. @size is not consulted.
+ */
+static long iopmem_direct_access(struct block_device *bdev, sector_t sector,
+			       void **kaddr, pfn_t *pfn, long size)
+{
+	struct iopmem_device *iopmem = bdev->bd_queue->queuedata;
+	resource_size_t byte_off;
+
+	if (!iopmem)
+		return -ENODEV;
+
+	byte_off = sector * 512;
+	*kaddr = iopmem->virt_addr + byte_off;
+	*pfn = phys_to_pfn_t(iopmem->phys_addr + byte_off, PFN_DEV | PFN_MAP);
+
+	return iopmem->size - byte_off;
+}
+
+/* Block device operations: whole-page I/O and DAX direct access. */
+static const struct block_device_operations iopmem_fops = {
+	.owner		= THIS_MODULE,
+	.rw_page	= iopmem_rw_page,
+	.direct_access	= iopmem_direct_access,
+};
+
+/* Allocator for the instance numbers used in the "iopmem%d" disk names. */
+static DEFINE_IDA(iopmem_instance_ida);
+
+/*
+ * Allocate a free instance number and store it in @iopmem->instance.
+ *
+ * ida_simple_get() performs its own preallocation and locking, so neither a
+ * driver-private lock nor an -EAGAIN retry loop is needed here.
+ *
+ * Returns 0 on success or the negative errno reported by the allocator.
+ */
+static int iopmem_set_instance(struct iopmem_device *iopmem)
+{
+	int instance;
+
+	instance = ida_simple_get(&iopmem_instance_ida, 0, 0, GFP_KERNEL);
+	if (instance < 0)
+		return instance;
+
+	iopmem->instance = instance;
+	return 0;
+}
+
+/* Hand @iopmem->instance back to the allocator so it can be reused. */
+static void iopmem_release_instance(struct iopmem_device *iopmem)
+{
+	ida_simple_remove(&iopmem_instance_ida, iopmem->instance);
+}
+
+/*
+ * Configure the request queue of @iopmem, then allocate, name, size and
+ * register the gendisk that exposes the device memory.
+ *
+ * Returns 0 on success or -ENOMEM if the gendisk cannot be allocated.
+ */
+static int iopmem_attach_disk(struct iopmem_device *iopmem)
+{
+	struct gendisk *disk;
+	int nid = dev_to_node(iopmem->dev);
+	struct request_queue *q = iopmem->queue;
+
+	blk_queue_write_cache(q, true, true);
+	blk_queue_make_request(q, iopmem_make_request);
+	blk_queue_physical_block_size(q, PAGE_SIZE);
+	blk_queue_max_hw_sectors(q, UINT_MAX);
+	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+	queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
+	/* The I/O handlers recover the device from q->queuedata. */
+	q->queuedata = iopmem;
+
+	disk = alloc_disk_node(0, nid);
+	if (unlikely(!disk))
+		return -ENOMEM;
+
+	disk->fops		= &iopmem_fops;
+	disk->queue		= q;
+	disk->flags		= GENHD_FL_EXT_DEVT;
+	/* Bound the write to the size of the name buffer. */
+	snprintf(disk->disk_name, sizeof(disk->disk_name), "iopmem%d",
+		 iopmem->instance);
+	/* Capacity is expressed in 512-byte sectors. */
+	set_capacity(disk, iopmem->size / 512);
+	iopmem->disk = disk;
+
+	device_add_disk(iopmem->dev, disk);
+	revalidate_disk(disk);
+
+	return 0;
+}
+
+/* Unregister the gendisk of @iopmem and drop the reference held on it. */
+static void iopmem_detach_disk(struct iopmem_device *iopmem)
+{
+	struct gendisk *disk = iopmem->disk;
+
+	del_gendisk(disk);
+	put_disk(disk);
+}
+
+/*
+ * Bind to a PCI function: enable it, claim and map BAR_ID, then create the
+ * request queue and gendisk that expose the BAR as a block device.
+ *
+ * Returns 0 on success or a negative errno. On failure, everything acquired
+ * so far is released through the unwind labels at the end of the function.
+ */
+static int iopmem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct iopmem_device *iopmem;
+	struct device *dev;
+	int err;
+	int nid = dev_to_node(&pdev->dev);
+
+	/* Propagate the real error so a failed enable is never reported as 0. */
+	err = pci_enable_device_mem(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "unable to enable device!\n");
+		goto out;
+	}
+
+	/*
+	 * The driver may be bound to arbitrary functions through sysfs, so
+	 * make sure the BAR about to be mapped is a populated memory BAR.
+	 */
+	if (!(pci_resource_flags(pdev, BAR_ID) & IORESOURCE_MEM) ||
+	    !pci_resource_len(pdev, BAR_ID)) {
+		dev_err(&pdev->dev, "BAR %d is not a usable memory BAR\n",
+			BAR_ID);
+		err = -ENODEV;
+		goto out_disable_device;
+	}
+
+	iopmem = kzalloc(sizeof(*iopmem), GFP_KERNEL);
+	if (unlikely(!iopmem)) {
+		err = -ENOMEM;
+		goto out_disable_device;
+	}
+
+	iopmem->phys_addr = pci_resource_start(pdev, BAR_ID);
+	iopmem->size = pci_resource_len(pdev, BAR_ID);
+	iopmem->dev = dev = get_device(&pdev->dev);
+	pci_set_drvdata(pdev, iopmem);
+
+	err = iopmem_set_instance(iopmem);
+	if (err)
+		goto out_put_device;
+
+	dev_info(dev, "bar space 0x%llx len %llu\n",
+		(unsigned long long) iopmem->phys_addr,
+		(unsigned long long) iopmem->size);
+
+	if (!devm_request_mem_region(dev, iopmem->phys_addr,
+				     iopmem->size, dev_name(dev))) {
+		/* %pa already prints its own "0x" prefix. */
+		dev_warn(dev, "could not reserve region [%pa:0x%zx]\n",
+			 &iopmem->phys_addr, iopmem->size);
+		err = -EBUSY;
+		goto out_release_instance;
+	}
+
+	iopmem->queue = blk_alloc_queue_node(GFP_KERNEL, nid);
+	if (!iopmem->queue) {
+		err = -ENOMEM;
+		goto out_release_instance;
+	}
+
+	iopmem->virt_addr = devm_memremap_pages(dev, &pdev->resource[BAR_ID],
+				&iopmem->queue->q_usage_counter,
+				NULL, MEMREMAP_WC);
+	if (IS_ERR(iopmem->virt_addr)) {
+		/* Report the error the mapping call actually returned. */
+		err = PTR_ERR(iopmem->virt_addr);
+		goto out_free_queue;
+	}
+
+	err = iopmem_attach_disk(iopmem);
+	if (err)
+		goto out_free_queue;
+
+	return 0;
+
+out_free_queue:
+	blk_cleanup_queue(iopmem->queue);
+out_release_instance:
+	iopmem_release_instance(iopmem);
+out_put_device:
+	put_device(&pdev->dev);
+	kfree(iopmem);
+out_disable_device:
+	pci_disable_device(pdev);
+out:
+	return err;
+}
+
+/*
+ * Unbind from a PCI function: mark the queue dying, remove the disk, clean
+ * up the queue, release the instance number and device reference, free the
+ * driver state and finally disable the PCI device.
+ */
+static void iopmem_remove(struct pci_dev *pdev)
+{
+	struct iopmem_device *priv = pci_get_drvdata(pdev);
+	struct request_queue *q = priv->queue;
+
+	blk_set_queue_dying(q);
+	iopmem_detach_disk(priv);
+	blk_cleanup_queue(q);
+	iopmem_release_instance(priv);
+	put_device(priv->dev);
+	kfree(priv);
+	pci_disable_device(pdev);
+}
+
+/* PCI driver glue: ID table plus the probe/remove entry points. */
+static struct pci_driver iopmem_pci_driver = {
+	.name		= "iopmem",
+	.id_table	= iopmem_id_table,
+	.probe		= iopmem_probe,
+	.remove		= iopmem_remove,
+};
+
+/*
+ * Module entry point: register the PCI driver and log the load on success.
+ * Returns the result of pci_register_driver().
+ */
+static int __init iopmem_init(void)
+{
+	int rc = pci_register_driver(&iopmem_pci_driver);
+
+	if (!rc)
+		pr_info("iopmem: module loaded\n");
+
+	return rc;
+}
+
+/* Module exit point: unregister the PCI driver, then log the unload. */
+static void __exit iopmem_exit(void)
+{
+	pci_unregister_driver(&iopmem_pci_driver);
+	pr_info("iopmem: module unloaded\n");
+}
+
+module_init(iopmem_init);
+module_exit(iopmem_exit);
+MODULE_AUTHOR("Logan Gunthorpe <logang@xxxxxxxxxxxx>");
+MODULE_LICENSE("GPL");
--
2.1.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>



[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]