On Thu, Mar 26, 2015 at 1:32 AM, Christoph Hellwig <hch@xxxxxx> wrote: > From: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx> > > PMEM is a new driver that presents a reserved range of memory as a > block device. This is useful for developing with NV-DIMMs, and > can be used with volatile memory as a development platform. > > Signed-off-by: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx> > [hch: convert to use a platform_device for discovery, fix partition > support] > Signed-off-by: Christoph Hellwig <hch@xxxxxx> > Tested-by: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx> > --- > MAINTAINERS | 6 + > drivers/block/Kconfig | 13 ++ > drivers/block/Makefile | 1 + > drivers/block/pmem.c | 373 +++++++++++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 393 insertions(+) > create mode 100644 drivers/block/pmem.c > > diff --git a/MAINTAINERS b/MAINTAINERS > index 358eb01..efacf2b 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -8063,6 +8063,12 @@ S: Maintained > F: Documentation/blockdev/ramdisk.txt > F: drivers/block/brd.c > > +PERSISTENT MEMORY DRIVER > +M: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx> > +L: linux-nvdimm@xxxxxxxxxxxx > +S: Supported > +F: drivers/block/pmem.c > + > RANDOM NUMBER DRIVER > M: "Theodore Ts'o" <tytso@xxxxxxx> > S: Maintained > diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig > index 1b8094d..9284aaf 100644 > --- a/drivers/block/Kconfig > +++ b/drivers/block/Kconfig > @@ -404,6 +404,19 @@ config BLK_DEV_RAM_DAX > and will prevent RAM block device backing store memory from being > allocated from highmem (only a problem for highmem systems). > > +config BLK_DEV_PMEM > + tristate "Persistent memory block device support" > + help > + Saying Y here will allow you to use a contiguous range of reserved > + memory as one or more block devices. Memory for PMEM should be > + reserved using the "memmap" kernel parameter. > + > + To compile this driver as a module, choose M here: the module will be > + called pmem. 
> + > + Most normal users won't need this functionality, and can thus say N > + here. > + > config CDROM_PKTCDVD > tristate "Packet writing on CD/DVD media" > depends on !UML > diff --git a/drivers/block/Makefile b/drivers/block/Makefile > index 02b688d..9cc6c18 100644 > --- a/drivers/block/Makefile > +++ b/drivers/block/Makefile > @@ -14,6 +14,7 @@ obj-$(CONFIG_PS3_VRAM) += ps3vram.o > obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o > obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o > obj-$(CONFIG_BLK_DEV_RAM) += brd.o > +obj-$(CONFIG_BLK_DEV_PMEM) += pmem.o > obj-$(CONFIG_BLK_DEV_LOOP) += loop.o > obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o > obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o > diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c > new file mode 100644 > index 0000000..545b13b > --- /dev/null > +++ b/drivers/block/pmem.c > @@ -0,0 +1,373 @@ > +/* > + * Persistent Memory Driver > + * Copyright (c) 2014, Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * This driver is heavily based on drivers/block/brd.c. > + * Copyright (C) 2007 Nick Piggin > + * Copyright (C) 2007 Novell Inc. 
> + */ > + > +#include <asm/cacheflush.h> > +#include <linux/blkdev.h> > +#include <linux/hdreg.h> > +#include <linux/init.h> > +#include <linux/platform_device.h> > +#include <linux/module.h> > +#include <linux/moduleparam.h> > +#include <linux/slab.h> > + > +#define SECTOR_SHIFT 9 > +#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) > +#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) > + > +#define PMEM_MINORS 16 > + > +struct pmem_device { > + struct request_queue *pmem_queue; > + struct gendisk *pmem_disk; > + > + /* One contiguous memory region per device */ > + phys_addr_t phys_addr; > + void *virt_addr; > + size_t size; > +}; > + > +static int pmem_major; > +static atomic_t pmem_index; > + > +static int pmem_getgeo(struct block_device *bd, struct hd_geometry *geo) > +{ > + /* some standard values */ > + geo->heads = 1 << 6; > + geo->sectors = 1 << 5; > + geo->cylinders = get_capacity(bd->bd_disk) >> 11; > + return 0; > +} > + > +/* > + * direct translation from (pmem,sector) => void* > + * We do not require that sector be page aligned. > + * The return value will point to the beginning of the page containing the > + * given sector, not to the sector itself. > + */ > +static void *pmem_lookup_pg_addr(struct pmem_device *pmem, sector_t sector) > +{ > + size_t page_offset = sector >> PAGE_SECTORS_SHIFT; > + size_t offset = page_offset << PAGE_SHIFT; > + > + BUG_ON(offset >= pmem->size); > + return pmem->virt_addr + offset; > +} > + > +/* sector must be page aligned */ > +static unsigned long pmem_lookup_pfn(struct pmem_device *pmem, sector_t sector) > +{ > + size_t page_offset = sector >> PAGE_SECTORS_SHIFT; > + > + BUG_ON(sector & (PAGE_SECTORS - 1)); > + return (pmem->phys_addr >> PAGE_SHIFT) + page_offset; > +} > + > +/* > + * sector is not required to be page aligned. > + * n is at most a single page, but could be less. 
> + */ > +static void copy_to_pmem(struct pmem_device *pmem, const void *src, > + sector_t sector, size_t n) > +{ > + void *dst; > + unsigned int offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT; > + size_t copy; > + > + BUG_ON(n > PAGE_SIZE); > + > + copy = min_t(size_t, n, PAGE_SIZE - offset); > + dst = pmem_lookup_pg_addr(pmem, sector); > + memcpy(dst + offset, src, copy); > + > + if (copy < n) { > + src += copy; > + sector += copy >> SECTOR_SHIFT; > + copy = n - copy; > + dst = pmem_lookup_pg_addr(pmem, sector); > + memcpy(dst, src, copy); > + } > +} > + > +/* > + * sector is not required to be page aligned. > + * n is at most a single page, but could be less. > + */ > +static void copy_from_pmem(void *dst, struct pmem_device *pmem, > + sector_t sector, size_t n) > +{ > + void *src; > + unsigned int offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT; > + size_t copy; > + > + BUG_ON(n > PAGE_SIZE); > + > + copy = min_t(size_t, n, PAGE_SIZE - offset); > + src = pmem_lookup_pg_addr(pmem, sector); > + > + memcpy(dst, src + offset, copy); > + > + if (copy < n) { > + dst += copy; > + sector += copy >> SECTOR_SHIFT; > + copy = n - copy; > + src = pmem_lookup_pg_addr(pmem, sector); > + memcpy(dst, src, copy); > + } > +} > + > +static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, > + unsigned int len, unsigned int off, int rw, > + sector_t sector) > +{ > + void *mem = kmap_atomic(page); > + > + if (rw == READ) { > + copy_from_pmem(mem + off, pmem, sector, len); > + flush_dcache_page(page); > + } else { > + /* > + * FIXME: Need more involved flushing to ensure that writes to > + * NVDIMMs are actually durable before returning. 
> + */ > + flush_dcache_page(page); > + copy_to_pmem(pmem, mem + off, sector, len); > + } > + > + kunmap_atomic(mem); > +} > + > +static void pmem_make_request(struct request_queue *q, struct bio *bio) > +{ > + struct block_device *bdev = bio->bi_bdev; > + struct pmem_device *pmem = bdev->bd_disk->private_data; > + int rw; > + struct bio_vec bvec; > + sector_t sector; > + struct bvec_iter iter; > + int err = 0; > + > + sector = bio->bi_iter.bi_sector; > + if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) { > + err = -EIO; > + goto out; > + } > + > + BUG_ON(bio->bi_rw & REQ_DISCARD); > + > + rw = bio_rw(bio); > + if (rw == READA) > + rw = READ; > + > + bio_for_each_segment(bvec, bio, iter) { > + unsigned int len = bvec.bv_len; > + > + BUG_ON(len > PAGE_SIZE); > + pmem_do_bvec(pmem, bvec.bv_page, len, > + bvec.bv_offset, rw, sector); > + sector += len >> SECTOR_SHIFT; > + } > + > +out: > + bio_endio(bio, err); > +} > + > +static int pmem_rw_page(struct block_device *bdev, sector_t sector, > + struct page *page, int rw) > +{ > + struct pmem_device *pmem = bdev->bd_disk->private_data; > + > + pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); > + page_endio(page, rw & WRITE, 0); > + return 0; > +} > + > +static long pmem_direct_access(struct block_device *bdev, sector_t sector, > + void **kaddr, unsigned long *pfn, long size) > +{ > + struct pmem_device *pmem = bdev->bd_disk->private_data; > + > + if (!pmem) > + return -ENODEV; > + > + *kaddr = pmem_lookup_pg_addr(pmem, sector); > + *pfn = pmem_lookup_pfn(pmem, sector); > + > + return pmem->size - (sector * 512); > +} > + > +static const struct block_device_operations pmem_fops = { > + .owner = THIS_MODULE, > + .rw_page = pmem_rw_page, > + .direct_access = pmem_direct_access, > + .getgeo = pmem_getgeo, > +}; > + > +/* pmem->phys_addr and pmem->size need to be set. > + * Will then set virt_addr if successful. 
> + */ > +static int pmem_mapmem(struct pmem_device *pmem) > +{ > + struct resource *res_mem; > + int err; > + > + res_mem = request_mem_region_exclusive(pmem->phys_addr, pmem->size, > + "pmem"); Isn't request_mem_region() enough? I.e., it seems request_mem_region_exclusive() assumes no DAX, at least in theory? > + if (!res_mem) { > + pr_warn("pmem: request_mem_region_exclusive phys=0x%llx size=0x%zx failed\n", > + pmem->phys_addr, pmem->size); > + return -EINVAL; > + } > + > + /* > + * Map the memory as non-cachable, as we can't write back the contents > + * of the CPU caches in case of a crash. > + */ > + pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size); This is fine for now, but I think we're going to end up with a continuum of solutions to this problem based on the platform and the device. Some ADR platforms have firmware that takes actions like flushing caches on a "power going away" signal. Other platforms have cache management instructions that we can use on either a per-I/O or a per-REQ_FUA/FLUSH-request basis. Hmm, with this being in the memory map by default, I think this poses a challenge for VIVT caches and aliased accesses? We can revisit this when ARM support shows up. -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html