[RFC PATCH 4/4] device-dax: Add a block device persistent type, BLK, for DAX KMEM

When a DAX KMEM device is formatted with the persistent type BLK, adding
the DAX memory exposes a block device /dev/kmem<numa_node>. A filesystem
can be created on this block device. Blocks which contain data are
unavailable for use as system memory, but blocks freed up with DISCARD
(for example via fstrim or a filesystem mounted with -o discard) become
available for use as system memory again.

The implementation uses an array which maps each logical block number to
the offset of its backing page in the DAX device. This preserves block
device semantics even though backing pages can be allocated anywhere in
the device.
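
On the device itself, page 0 of the DAX region holds a small superblock,
the next num_index_pages pages hold the index, and the remaining pages
back the data blocks. As a rough userspace sketch of the lookup (purely
illustrative; the names and the 4 KiB page size below are assumptions,
not taken from this patch), resolving a logical block is a two-level
table walk:

    #include <stdint.h>

    #define ENTRIES_PER_PAGE (4096 / sizeof(uint64_t))  /* 512 per 4 KiB index page */

    /* Return the DAX page index backing logical block 'blk', 0 if unallocated. */
    static uint64_t lookup_backing_page(uint64_t *const *index_page, uint64_t blk)
    {
            return index_page[blk / ENTRIES_PER_PAGE][blk % ENTRIES_PER_PAGE];
    }

An index entry of 0 means the block has no backing page yet: reads of
such a block return zeroes, a write allocates a page from the DAX region
and records its index, and DISCARD clears the entry and hands the page
back to the page allocator.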

Signed-off-by: Srinivas Aji <srinivas.aji@xxxxxxxxxxxx>
---
 drivers/dax/Makefile       |   1 +
 drivers/dax/kmem.c         |   4 +-
 drivers/dax/kmem_blk.c     | 573 +++++++++++++++++++++++++++++++++++++
 drivers/dax/kmem_persist.h |   4 +
 4 files changed, 581 insertions(+), 1 deletion(-)
 create mode 100644 drivers/dax/kmem_blk.c

diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 90a56ca3b345..d0a97f4af4ea 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_DAX) += dax.o
 obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
+obj-$(CONFIG_DEV_DAX_KMEM_PERSIST) += kmem_blk.o
 
 dax-y := super.o
 dax-y += bus.o
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 0ca6e14f7e73..0fa45d1ba9cc 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -534,8 +534,10 @@ static int __init dax_kmem_init(void)
 	if (rc)
 		kfree_const(kmem_name);
 #ifdef CONFIG_DEV_DAX_KMEM_PERSIST
-	if (rc == 0)
+	if (rc == 0) {
 		kmem_persist_type_register(&kmem_persist_none_ops);
+		kmem_persist_type_register(&kmem_persist_blk_ops);
+	}
 #endif
 	return rc;
 }
diff --git a/drivers/dax/kmem_blk.c b/drivers/dax/kmem_blk.c
new file mode 100644
index 000000000000..856b35713999
--- /dev/null
+++ b/drivers/dax/kmem_blk.c
@@ -0,0 +1,573 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2022 MemVerge. All rights reserved. */
+#include <linux/module.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include "dax-private.h"
+#include "kmem_persist.h"
+
+static const unsigned int index_entries_per_page = (PAGE_SIZE / sizeof(u64));
+
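+/*
+ * Layout within the DAX region: page 0 holds struct kmem_blk_super, the
+ * next num_index_pages pages hold the block index (one u64 entry per
+ * logical block, 0 meaning "no backing page"), and the remaining pages
+ * back the data blocks.
+ */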
+struct kmem_blk_super {
+	struct kmem_persist_superblock header;
+	u64 num_index_pages;
+	u64 num_index_entries;
+} __packed;
+
+struct kmem_blk_data {
+	struct dev_dax *dev_dax;
+	struct gendisk *disk;
+	spinlock_t index_lock;
+	struct kmem_blk_super *super;
+	unsigned long num_index_pages;
+	u64 *index_page[];
+};
+
+// TODO: Make sure locking is sound for multiple concurrent I/Os,
+// e.g. concurrent writes and discards.
+
+static struct page *kmem_blk_get_page(struct kmem_blk_data *data,
+				sector_t sector)
+{
+	pgoff_t i = sector >> PAGE_SECTORS_SHIFT;
+	u64 page_num;
+
+	spin_lock(&data->index_lock);
+	page_num = data->index_page
+		[i / index_entries_per_page]
+		[i % index_entries_per_page];
+	spin_unlock(&data->index_lock);
+
+	if (page_num)
+		return dax_kmem_index_to_page(page_num, data->dev_dax);
+	else
+		return NULL;
+}
+
+/*
+ * Ensure the block containing @sector has a backing page. May sleep.
+ */
+static int kmem_blk_insert_page(struct kmem_blk_data *data, sector_t sector)
+{
+	pgoff_t i = sector >> PAGE_SECTORS_SHIFT;
+	struct page *page;
+	unsigned long page_index = 0;
+	u64 page_num; // TODO fixup u64 / unsigned long to use one type?
+	u64 *index_ptr =
+		&data->index_page
+		[i / index_entries_per_page][i % index_entries_per_page];
+
+	/* Check if block exists */
+	spin_lock(&data->index_lock);
+	page_num = *index_ptr;
+	spin_unlock(&data->index_lock);
+	if (page_num)
+		return 0;
+
+	page = dax_kmem_alloc_page(data->dev_dax, &page_index);
+	if (!page) {
+		dev_err(&data->dev_dax->dev, "Cannot allocate page\n");
+		return -1;
+	}
+
+	spin_lock(&data->index_lock);
+	if (*index_ptr != 0)
+		__free_page(page);
+	else
+		*index_ptr = page_index;
+	spin_unlock(&data->index_lock);
+
+	return 0;
+}
+
+static int kmem_blk_discard(struct kmem_blk_data *data,
+			sector_t sector, size_t n)
+{
+	pgoff_t i = sector >> PAGE_SECTORS_SHIFT;
+	struct page *page;
+	u64 page_num; // TODO fixup u64 / unsigned long to use one type?
+	u64 *index_ptr;
+
+	BUG_ON(sector & ((1 << PAGE_SECTORS_SHIFT) - 1));
+	BUG_ON(n & (PAGE_SIZE - 1));
+
+	while (n > 0) {
+		BUG_ON(i >= data->super->num_index_entries);
+		index_ptr =
+			&data->index_page
+			[i / index_entries_per_page]
+			[i % index_entries_per_page];
+		spin_lock(&data->index_lock);
+		page_num = *index_ptr;
+		if (page_num)
+			*index_ptr = 0;
+		spin_unlock(&data->index_lock);
+		if (page_num) {
+			page = dax_kmem_index_to_page(page_num, data->dev_dax);
+			__free_page(page);
+		}
+		i++;
+		n -= PAGE_SIZE;
+	}
+	return 0;
+}
+
+/*
+ * copy_to_kmem_blk_setup must be called before copy_to_kmem_blk. It may sleep.
+ */
+static int copy_to_kmem_blk_setup(struct kmem_blk_data *data, sector_t sector, size_t n)
+{
+	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+	size_t copy;
+
+	copy = min_t(size_t, n, PAGE_SIZE - offset);
+	if (kmem_blk_insert_page(data, sector))
+		return -ENOSPC;
+	if (copy < n) {
+		sector += copy >> SECTOR_SHIFT;
+		if (kmem_blk_insert_page(data, sector))
+			return -ENOSPC;
+	}
+	return 0;
+}
+
+/*
+ * Copy n bytes from src to the block device starting at sector. Does not sleep.
+ */
+static void copy_to_kmem_blk(struct kmem_blk_data *data, const void *src,
+			sector_t sector, size_t n)
+{
+	struct page *page;
+	void *dst;
+	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+	size_t copy;
+
+	copy = min_t(size_t, n, PAGE_SIZE - offset);
+	page = kmem_blk_get_page(data, sector);
+	BUG_ON(!page);
+
+	dst = kmap_atomic(page);
+	memcpy(dst + offset, src, copy);
+	kunmap_atomic(dst);
+
+	if (copy < n) {
+		src += copy;
+		sector += copy >> SECTOR_SHIFT;
+		copy = n - copy;
+		page = kmem_blk_get_page(data, sector);
+		BUG_ON(!page);
+
+		dst = kmap_atomic(page);
+		memcpy(dst, src, copy);
+		kunmap_atomic(dst);
+	}
+}
+
+/*
+ * Copy n bytes to dst from the block device starting at sector. Does not sleep.
+ */
+static void copy_from_kmem_blk(void *dst, struct kmem_blk_data *data,
+			sector_t sector, size_t n)
+{
+	struct page *page;
+	void *src;
+	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+	size_t copy;
+
+	copy = min_t(size_t, n, PAGE_SIZE - offset);
+	page = kmem_blk_get_page(data, sector);
+	if (page) {
+		src = kmap_atomic(page);
+		memcpy(dst, src + offset, copy);
+		kunmap_atomic(src);
+	} else
+		memset(dst, 0, copy);
+
+	if (copy < n) {
+		dst += copy;
+		sector += copy >> SECTOR_SHIFT;
+		copy = n - copy;
+		page = kmem_blk_get_page(data, sector);
+		if (page) {
+			src = kmap_atomic(page);
+			memcpy(dst, src, copy);
+			kunmap_atomic(src);
+		} else
+			memset(dst, 0, copy);
+	}
+}
+
+/*
+ * Process a single bvec of a bio.
+ */
+static int kmem_blk_do_bvec(struct kmem_blk_data *data, struct page *page,
+			unsigned int len, unsigned int off, unsigned int op,
+			sector_t sector)
+{
+	void *mem = NULL;
+	int err = 0;
+
+	if (op == REQ_OP_WRITE) {
+		err = copy_to_kmem_blk_setup(data, sector, len);
+		if (err)
+			goto out;
+	}
+
+	if (page)
+		mem = kmap_atomic(page);
+	switch (op) {
+	case REQ_OP_READ:
+		copy_from_kmem_blk(mem + off, data, sector, len);
+		flush_dcache_page(page);
+		break;
+	case REQ_OP_WRITE:
+		flush_dcache_page(page);
+		copy_to_kmem_blk(data, mem + off, sector, len);
+		break;
+	case REQ_OP_DISCARD:
+		BUG_ON(page);
+		kmem_blk_discard(data, sector, len);
+		break;
+	default:
+		BUG();
+		break;
+	}
+	if (mem)
+		kunmap_atomic(mem);
+
+out:
+	return err;
+}
+
+static void kmem_blk_submit_bio(struct bio *bio)
+{
+	struct kmem_blk_data *data = bio->bi_bdev->bd_disk->private_data;
+	sector_t sector = bio->bi_iter.bi_sector;
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+
+	/*
+	 * DISCARD and WRITE_ZEROES come separately and don't work with
+	 * bio_for_each_segment
+	 */
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+	case REQ_OP_WRITE_ZEROES:
+		kmem_blk_discard(data, sector, bio->bi_iter.bi_size);
+		bio_endio(bio);
+		return;
+	default:
+		break;
+	}
+
+	bio_for_each_segment(bvec, bio, iter) {
+		unsigned int len = bvec.bv_len;
+		int err;
+
+		/* Don't support un-aligned buffer */
+		WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
+				(len & (SECTOR_SIZE - 1)));
+		err = kmem_blk_do_bvec(data, bvec.bv_page, len, bvec.bv_offset,
+				bio_op(bio), sector);
+		if (err) {
+			bio_io_error(bio);
+			return;
+		}
+		sector += len >> SECTOR_SHIFT;
+	}
+
+	bio_endio(bio);
+}
+
+static int kmem_blk_rw_page(struct block_device *bdev, sector_t sector,
+			struct page *page, unsigned int op)
+{
+	struct kmem_blk_data *data = bdev->bd_disk->private_data;
+	int err;
+
+	if (PageTransHuge(page))
+		return -EOPNOTSUPP;
+	err = kmem_blk_do_bvec(data, page, PAGE_SIZE, 0, op, sector);
+	page_endio(page, op_is_write(op), err);
+	return err;
+}
+
+static const struct block_device_operations kmem_blk_fops = {
+	.owner =		THIS_MODULE,
+	.submit_bio =		kmem_blk_submit_bio,
+	.rw_page =		kmem_blk_rw_page,
+};
+
+static int kmem_blk_disk_init(struct kmem_blk_data *data)
+{
+	struct gendisk *disk;
+	int err;
+
+	disk = blk_alloc_disk(data->dev_dax->target_node);
+	if (!disk)
+		return -ENOMEM;
+	data->disk = disk;
+
+	disk->flags = GENHD_FL_NO_PART;
+	disk->fops = &kmem_blk_fops;
+	disk->private_data = data;
+	snprintf(disk->disk_name, DISK_NAME_LEN, "kmem%d",
+		data->dev_dax->target_node);
+
+	set_capacity(disk,
+		data->super->num_index_entries << PAGE_SECTORS_SHIFT);
+
+	// TODO: Handle cases where PAGE_SIZE is too big.
+	/* Set physical and logical block size to PAGE_SIZE */
+	blk_queue_physical_block_size(disk->queue, PAGE_SIZE);
+	blk_queue_logical_block_size(disk->queue, PAGE_SIZE);
+
+	/* Tell the block layer that this is not a rotational device */
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
+	/* Don't use this for randomness */
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
+
+	/* Support discard */
+	blk_queue_flag_set(QUEUE_FLAG_DISCARD, disk->queue);
+	disk->queue->limits.discard_granularity = PAGE_SIZE;
+	blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
+	/* We can handle WRITE_ZEROES as DISCARD, at units of page size */
+	blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX);
+
+	err = add_disk(disk);
+	if (err)
+		goto out_cleanup_disk;
+
+	return 0;
+out_cleanup_disk:
+	blk_cleanup_disk(data->disk);
+	data->disk = NULL;
+	return err;
+}
+
+static void kmem_blk_disk_cleanup(struct kmem_blk_data *data)
+{
+	if (data->disk == NULL)
+		return;
+	del_gendisk(data->disk);
+	blk_cleanup_disk(data->disk);
+	data->disk = NULL;
+}
+
+/* Format device with full allocation */
+static int kmem_blk_format(struct dev_dax *dev_dax)
+{
+	struct kmem_blk_super *super =
+		kmap_local_page(dax_kmem_index_to_page(0, dev_dax));
+
+	unsigned long num_pages = dax_kmem_num_pages(dev_dax);
+	u64 i;
+	/*
+	 * c = a / b => c is largest c s.t. c * b <= a.
+	 * c = (a + b - 1) / b is smallest c s.t. c * b >= a
+	 * num_index_pages is the smallest number such that
+	 * 1 + num_index_pages + num_index_pages * index_entries_per_page >= num_pages
+	 * i.e. num_index_pages * (1 + index_entries_per_page) >= num_pages - 1, so
+	 * num_index_pages =
+	 *   ((num_pages - 1) + (1 + index_entries_per_page) - 1 ) /
+	 *   (1 + index_entries_per_page)
+	 */
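+	/*
+	 * Worked example (illustrative, assumes 4 KiB pages): a 1 GiB
+	 * device has num_pages = 262144 and index_entries_per_page = 512,
+	 * giving num_index_pages = 511 and num_index_entries = 261632.
+	 */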
+	u64 num_index_pages =
+		(num_pages + index_entries_per_page - 1) /
+		(1 + index_entries_per_page);
+	super->header.magic = kmem_persist_magic;
+	super->header.type = KMEM_PERSIST_BLK;
+	super->num_index_pages = num_index_pages;
+	super->num_index_entries = num_pages - 1 - num_index_pages;
+
+	for (i = 0; i < num_index_pages; i++) {
+		u64 *index_array =
+			kmap_local_page(dax_kmem_index_to_page(1 + i, dev_dax));
+#if !defined(KMEM_PERSIST_BLK_FORMAT_FULL)
+		memset(index_array, 0, PAGE_SIZE);
+#else /* KMEM_PERSIST_BLK_FORMAT_FULL */
+		u64 j;
+
+		for (j = 0; j < index_entries_per_page; j++) {
+			u64 idx =
+				1 + num_index_pages +
+				i * index_entries_per_page + j;
+
+			if (idx >= num_pages)
+				idx = 0;
+			index_array[j] = idx;
+		}
+#endif
+		kunmap_local(index_array);
+	}
+	kunmap_local(super);
+	return 0;
+}
+
+/* Free unused blocks in the dax memory to system */
+static int kmem_blk_free_unused(struct kmem_blk_data *data)
+{
+	struct kmem_blk_super *super = data->super;
+	unsigned long num_pages = dax_kmem_num_pages(data->dev_dax);
+	u64 *alloc_bitmap;
+	unsigned long i;
+
+	/* Bitmap for tracking allocated pages. Temporary */
+	alloc_bitmap =
+		kvzalloc(sizeof(u64) * BITS_TO_U64(num_pages), GFP_KERNEL);
+	if (alloc_bitmap == NULL) {
+		dev_err(&data->dev_dax->dev,
+			"Unable to allocate bit array. Not freeing unused space.\n");
+		return -ENOMEM;
+	}
+
+	/* Free up pages unused by block storage to memory */
+	for (i = 0; i < super->num_index_entries; i++) {
+		u64 page_num = data->index_page
+			[i / index_entries_per_page]
+			[i % index_entries_per_page];
+
+		if (page_num != 0) {
+			BUG_ON(page_num < 1 + super->num_index_pages ||
+				page_num >= num_pages);
+			/* Set bit */
+			alloc_bitmap[page_num / 64] |= 1ULL << (page_num % 64);
+		}
+	}
+
+	for (i = 1 + super->num_index_pages; i < num_pages; i++) {
+		struct page *page;
+
+		if (!(alloc_bitmap[i / 64] & (1ULL << (i % 64)))) {
+			/* Bit clear. Page not used */
+			page = dax_kmem_index_to_page(i, data->dev_dax);
+			__free_page(page);
+		}
+	}
+
+	kvfree(alloc_bitmap);
+	return 0;
+}
+
+static int kmem_blk_probe(struct dev_dax *dev_dax, void **persist_data)
+{
+	struct device *dev = &dev_dax->dev;
+	struct kmem_blk_super *super;
+	unsigned long i;
+	struct kmem_blk_data *data;
+	unsigned long num_pages = dax_kmem_num_pages(dev_dax);
+
+	if (num_pages == 0) {
+		dev_err(dev, "Dax device for KMEM has no pages\n");
+		*persist_data = NULL;
+		return -ENXIO;
+	}
+
+	super = kmap(dax_kmem_index_to_page(0, dev_dax));
+
+	/* Validate superblock magic and type */
+	if (super->header.magic != kmem_persist_magic ||
+		super->header.type != KMEM_PERSIST_BLK) {
+		dev_err("KMEM not formatted for blk, magic %lx type %d\n",
+			super->header.magic, super->header.type);
+		kunmap(dax_kmem_index_to_page(0, dev_dax));
+		*persist_data = NULL;
+		return -EINVAL;
+	}
+
+	/* Validate superblock index page counts */
+	if (super->num_index_entries <=
+		super->num_index_pages * index_entries_per_page &&
+		1 + super->num_index_pages + super->num_index_entries
+		== num_pages) {
+		dev_info(dev,
+			"Found kmem_blk superblock num_index_entries %llu num_index_pages %llu num_pages %lu\n",
+			super->num_index_entries,
+			super->num_index_pages, num_pages);
+	} else {
+		dev_warn(dev,
+			"Invalid kmem_blk superblock num_index_entries %llu num_index_pages %llu num_pages %lu\n",
+			super->num_index_entries,
+			super->num_index_pages, num_pages);
+		kunmap(dax_kmem_index_to_page(0, dev_dax));
+		*persist_data = NULL;
+		return -EINVAL;
+	}
+
+	data = kzalloc(struct_size(data, index_page, super->num_index_pages),
+		GFP_KERNEL);
+	if (!data) {
+		kunmap(dax_kmem_index_to_page(0, dev_dax));
+		*persist_data = NULL;
+		return -ENOMEM;
+	}
+
+	*persist_data = data;
+	data->dev_dax = dev_dax;
+	data->super = super;
+	spin_lock_init(&data->index_lock);
+
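+	/*
+	 * Map all index pages up front so the I/O path can read the block
+	 * index directly under index_lock.
+	 */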
+	for (i = 0; i < super->num_index_pages; i++)
+		data->index_page[i] =
+			kmap(dax_kmem_index_to_page(i + 1, dev_dax));
+
+	kmem_blk_free_unused(data);
+
+	kmem_blk_disk_init(data);
+
+	return 0;
+}
+
+static int kmem_blk_cleanup(struct dev_dax *dev_dax, void *persist_data)
+{
+	struct kmem_blk_data *data = persist_data;
+	unsigned long num_pages = dax_kmem_num_pages(dev_dax);
+	unsigned long i;
+
+	if (data == NULL)
+		return -1;
+
+	kmem_blk_disk_cleanup(data);
+
+	if (data->super == NULL) {
+		for (i = 0; i < num_pages; i++)
+			__free_page(dax_kmem_index_to_page(i, dev_dax));
+	} else {
+		for (i = 0; i < data->super->num_index_entries; i++) {
+			u64 page_num = data->index_page
+				[i / index_entries_per_page]
+				[i % index_entries_per_page];
+			if (page_num != 0) {
+				__free_page(dax_kmem_index_to_page(page_num,
+								   dev_dax));
+			}
+		}
+		for (i = 0; i < data->super->num_index_pages; i++) {
+			struct page *page =
+				dax_kmem_index_to_page(1 + i, dev_dax);
+			data->index_page[i] = NULL;
+			kunmap(page);
+			__free_page(page);
+		}
+		data->super = NULL;
+		kunmap(dax_kmem_index_to_page(0, dev_dax));
+		__free_page(dax_kmem_index_to_page(0, dev_dax));
+	}
+	kfree(data);
+	return 0;
+}
+
+struct kmem_persist_ops kmem_persist_blk_ops = {
+	.type = KMEM_PERSIST_BLK,
+	.format = kmem_blk_format,
+	.probe = kmem_blk_probe,
+	.cleanup = kmem_blk_cleanup
+};
diff --git a/drivers/dax/kmem_persist.h b/drivers/dax/kmem_persist.h
index dd651025f28c..0e0279feaa12 100644
--- a/drivers/dax/kmem_persist.h
+++ b/drivers/dax/kmem_persist.h
@@ -10,6 +10,7 @@ struct dev_dax;
 
 enum kmem_persist_type {
 	KMEM_PERSIST_NONE = 0,
+	KMEM_PERSIST_BLK,
 };
 
 
@@ -40,4 +41,7 @@ unsigned long dax_kmem_num_pages(struct dev_dax *dev_dax);
 struct page *dax_kmem_alloc_page(struct dev_dax *dev_dax,
 				unsigned long *page_index);
 
+/* Defined in kmem_blk.c */
+extern struct kmem_persist_ops kmem_persist_blk_ops;
+
 #endif
-- 
2.30.2