[RFC PATCH 2/4] device-dax: Add framework for keeping persistent data in DAX KMEM

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



DAX memory treated as system RAM will be checked for persistent data
and passed to the appropriate plugin. The plugin is provided with
functions for accessing pages by device offset, and also for
allocating and freeing pages from the DAX device provided memory.

Add a config option CONFIG_DEV_DAX_KMEM_PERSIST which controls this
feature.

The plugin framework allows multiple formats for the persistent data.
The contents of the initial block determine which plugin is used.

A module parameter, persist_format_type, to the kmem module sets the
plugin type which is used to format any hotplugged DAX which does not
have a recognized initial block.

Note: With just this change, but without the further patches which implement
plugins for persistence, adding a DAX device as KMEM will only add
it as fully allocated.

Limitation: Adding a DAX device as KMEM will succeed only if this memory
is the only memory in its NUMA node.

Signed-off-by: Srinivas Aji <srinivas.aji@xxxxxxxxxxxx>
---
 drivers/dax/Kconfig        |  13 ++
 drivers/dax/kmem.c         | 266 ++++++++++++++++++++++++++++++++++++-
 drivers/dax/kmem_persist.h |  43 ++++++
 3 files changed, 320 insertions(+), 2 deletions(-)
 create mode 100644 drivers/dax/kmem_persist.h

diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 5fdf269a822e..837178b841b6 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -66,4 +66,17 @@ config DEV_DAX_KMEM
 
 	  Say N if unsure.
 
+config DEV_DAX_KMEM_PERSIST
+	bool "KMEM PERSIST: persistent storage together with kmem"
+	default DEV_DAX_KMEM
+	depends on DEV_DAX_KMEM
+	help
+	  Support using a DAX device as system memory while also allowing
+	  persistent data on it. This is done by treating all the
+	  persistent data as pre-allocated when we add the DAX device
+	  as system memory and further acquiring and releasing persistent
+	  data through memory allocate and free.
+
+	  Say N if unsure.
+
 endif
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index a37622060fff..df7cfc8ace78 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -13,6 +13,9 @@
 #include <linux/mman.h>
 #include "dax-private.h"
 #include "bus.h"
+#ifdef CONFIG_DEV_DAX_KMEM_PERSIST
+#include "kmem_persist.h"
+#endif
 
 /* Memory resource name used for add_memory_driver_managed(). */
 static const char *kmem_name;
@@ -38,9 +41,23 @@ static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
 struct dax_kmem_data {
 	const char *res_name;
 	int mgid;
+#ifdef CONFIG_DEV_DAX_KMEM_PERSIST
+	unsigned long total_len;	/* bytes; converted to pages by dax_kmem_num_pages() */
+	struct kmem_persist_ops *persist_ops;	/* plugin set by kmem_persist_probe() */
+	void *persist_data;	/* plugin-private pointer filled in by ops->probe() */
+#endif
 	struct resource *res[];
 };
 
+#ifdef CONFIG_DEV_DAX_KMEM_PERSIST
+static int kmem_persist_probe(struct dev_dax *dev_dax,
+			struct kmem_persist_ops **persist_ops,
+			void **persist_data);
+static int kmem_persist_cleanup(struct dev_dax *dev_dax,
+				struct kmem_persist_ops *persist_ops,
+				void *persist_data);
+#endif
+
 static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 {
 	struct device *dev = &dev_dax->dev;
@@ -48,6 +65,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 	struct dax_kmem_data *data;
 	int i, rc, mapped = 0;
 	int numa_node;
+	mhp_t mhp_flags;
 
 	/*
 	 * Ensure good NUMA information for the persistent memory.
@@ -62,6 +80,18 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 		return -EINVAL;
 	}
 
+#ifdef CONFIG_DEV_DAX_KMEM_PERSIST
+	/*
+	 * Check if NUMA node has any memory already
+	 */
+	if (node_online(numa_node) && node_present_pages(numa_node) != 0) {
+		dev_warn(dev,
+			"rejecting DAX region on numa_node with existing memory: numa_node %d, existing pages %lu\n",
+			numa_node, node_present_pages(numa_node));
+		return -EINVAL;
+	}
+#endif
+
 	for (i = 0; i < dev_dax->nr_range; i++) {
 		struct range range;
 
@@ -92,6 +122,15 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 	if (rc < 0)
 		goto err_reg_mgid;
 	data->mgid = rc;
+#ifdef CONFIG_DEV_DAX_KMEM_PERSIST
+	data->total_len = total_len;
+#endif
+
+	mhp_flags = MHP_NID_IS_MGID
+#ifdef CONFIG_DEV_DAX_KMEM_PERSIST
+		| MHP_ALLOCATE
+#endif
+		;
 
 	for (i = 0; i < dev_dax->nr_range; i++) {
 		struct resource *res;
@@ -130,8 +169,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 		 * this as RAM automatically.
 		 */
 		rc = add_memory_driver_managed(data->mgid, range.start,
-				range_len(&range), kmem_name, MHP_NID_IS_MGID);
-
+				range_len(&range), kmem_name, mhp_flags);
 		if (rc) {
 			dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n",
 					i, range.start, range.end);
@@ -147,6 +185,14 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 
 	dev_set_drvdata(dev, data);
 
+#ifdef CONFIG_DEV_DAX_KMEM_PERSIST
+	rc = kmem_persist_probe(dev_dax,
+				&data->persist_ops,
+				&data->persist_data);
+	if (rc)
+		dev_err(dev, "Cannot setup kmem persistent data\n");
+#endif
+
 	return 0;
 
 err_request_mem:
@@ -165,6 +211,18 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 	struct device *dev = &dev_dax->dev;
 	struct dax_kmem_data *data = dev_get_drvdata(dev);
 
+#ifdef CONFIG_DEV_DAX_KMEM_PERSIST
+	/*
+	 * TODO:This is probably the wrong place to call this. We need to
+	 * call this before the blocks are marked offline, but after
+	 * ensuring no new allocations.
+	 */
+	if (kmem_persist_cleanup(dev_dax, data->persist_ops,
+					data->persist_data)) {
+		dev_err(dev, "Block device cannot be freed.\n");
+		return;
+	}
+#endif
 	/*
 	 * We have one shot for removing memory, if some memory blocks were not
 	 * offline prior to calling this function remove_memory() will fail, and
@@ -214,6 +272,210 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
+#ifdef CONFIG_DEV_DAX_KMEM_PERSIST
+/* Look up the struct page for a device page index (NULL if out of range). */
+struct page *dax_kmem_index_to_page(unsigned long page_index,
+				struct dev_dax *dev_dax)
+{
+	struct dax_kmem_data *data = dev_get_drvdata(&dev_dax->dev);
+	unsigned long base = 0;	/* pages in the ranges already skipped */
+	int idx;
+
+	for (idx = 0; idx < dev_dax->nr_range; idx++) {
+		struct resource *res = data->res[idx];
+		unsigned long cnt = (res->end + 1 - res->start) >> PAGE_SHIFT;
+
+		/* The index falls inside this range: translate it to a pfn. */
+		if (page_index < base + cnt)
+			return pfn_to_page((res->start >> PAGE_SHIFT) +
+					(page_index - base));
+		base += cnt;
+	}
+	/* page_index lies beyond the last range. */
+	return NULL;
+}
+
+/* Number of pages on the device: the recorded total_len shifted to pages. */
+unsigned long dax_kmem_num_pages(struct dev_dax *dev_dax)
+{
+	struct dax_kmem_data *kdata = dev_get_drvdata(&dev_dax->dev);
+
+	return kdata->total_len >> PAGE_SHIFT;
+}
+
+/*
+ * Allocate one zeroed page from the DAX device's target node and report,
+ * through @page_index, its page offset within the device's ranges.
+ * Returns NULL if allocation fails or the page is outside every range.
+ */
+struct page *dax_kmem_alloc_page(struct dev_dax *dev_dax,
+				unsigned long *page_index)
+{
+	struct device *dev = &dev_dax->dev;
+	struct dax_kmem_data *data = dev_get_drvdata(dev);
+	unsigned long base = 0;
+	struct page *page;
+	u64 phys;
+	int i;
+
+	page = alloc_pages_node(dev_dax->target_node,
+				GFP_NOIO | __GFP_ZERO | __GFP_THISNODE, 0);
+	if (!page)
+		return NULL;
+
+	phys = __pfn_to_phys(page_to_pfn(page));
+	for (i = 0; i < dev_dax->nr_range; i++) {
+		struct resource *r = data->res[i];
+
+		if (phys >= r->start && phys <= r->end) {
+			*page_index = base + ((phys - r->start) >> PAGE_SHIFT);
+			return page;
+		}
+		base += (r->end + 1 - r->start) >> PAGE_SHIFT;
+	}
+
+	/* The allocator handed back memory that is not part of this device. */
+	dev_err(dev, "Allocated page not in DAX range. Freeing.\n");
+	__free_page(page);
+	return NULL;
+}
+
+static int persist_format_type = -1; /* -1: do not format unrecognized devices */
+module_param(persist_format_type, int, 0644);
+
+/*
+ * Forcibly format new KMEM with persist_format_type. This can cause loss of
+ * existing persistent data, so this should be replaced with some other
+ * reformatting mechanism. NOTE(review): no visible code reads this flag yet.
+ */
+static bool persist_format_force;
+module_param(persist_format_force, bool, 0644);
+
+static LIST_HEAD(persist_types); /* registered kmem_persist_ops entries */
+static DEFINE_MUTEX(persist_types_lock); /* guards the list and ref_count */
+
+int kmem_persist_type_register(struct kmem_persist_ops *ops)
+{
+	mutex_lock(&persist_types_lock);	/* serialize list updates */
+	ops->ref_count = 0;	/* raised by kmem_persist_probe() per matching device */
+	list_add_tail(&ops->next, &persist_types);
+	mutex_unlock(&persist_types_lock);
+	return 0;	/* no failure path: @ops is not validated here */
+}
+
+int kmem_persist_type_unregister(struct kmem_persist_ops *ops)
+{
+	bool in_use;	/* a probed device still holds a reference on @ops */
+
+	mutex_lock(&persist_types_lock);
+	in_use = ops->ref_count != 0;
+	if (!in_use)
+		list_del(&ops->next);
+	mutex_unlock(&persist_types_lock);
+	return in_use ? -1 : 0;
+}
+
+/*
+ * Pick the plugin named by the first page's superblock, formatting with
+ * persist_format_type if unformatted. Keeps a plugin reference only on success.
+ */
+static int kmem_persist_probe(struct dev_dax *dev_dax,
+		struct kmem_persist_ops **persist_ops,
+		void **persist_data)
+{
+	struct device *dev = &dev_dax->dev;
+	struct kmem_persist_superblock *super;
+	struct kmem_persist_ops *ops = NULL, *cur;
+	enum kmem_persist_type ptype;
+	bool format = false;
+	void *data;
+	int rc = -EINVAL;
+
+	super = kmap_local_page(dax_kmem_index_to_page(0, dev_dax));
+	if (super->magic == kmem_persist_magic) {
+		ptype = super->type;
+	} else if (persist_format_type != -1) {
+		ptype = persist_format_type;
+		format = true;
+	} else {
+		dev_err(dev, "kmem unformatted for persistence\n");
+		goto out_unmap;
+	}
+
+	mutex_lock(&persist_types_lock);
+	list_for_each_entry(cur, &persist_types, next) {
+		if (cur->type == ptype) {
+			ops = cur;
+			ops->ref_count++;
+			break;
+		}
+	}
+	mutex_unlock(&persist_types_lock);
+	if (!ops) {
+		dev_err(dev, "No persistence module with type %d\n", ptype);
+		goto out_unmap;
+	}
+
+	if (format) {
+		rc = ops->format(dev_dax);
+		/* format() returning 0 without a valid superblock is a failure */
+		if (!rc && (super->magic != kmem_persist_magic ||
+			    super->type != ptype))
+			rc = -EIO;
+		if (rc) {
+			dev_err(dev, "Error formatting kmem persistence type %d\n",
+				ptype);
+			goto out_put;
+		}
+	}
+	kunmap_local(super);
+	super = NULL;
+
+	rc = ops->probe(dev_dax, &data);
+	if (rc) {
+		dev_err(dev, "Error initializing kmem persistence type %d\n",
+			ptype);
+		goto out_put;
+	}
+	*persist_ops = ops;
+	*persist_data = data;
+	return 0;
+
+out_put:
+	/* Drop the reference taken above so the type can still be unregistered. */
+	mutex_lock(&persist_types_lock);
+	ops->ref_count--;
+	mutex_unlock(&persist_types_lock);
+out_unmap:
+	if (super)
+		kunmap_local(super);
+	return rc;
+}
+
+/* Run the plugin's cleanup hook, then release the reference taken at probe. */
+static int kmem_persist_cleanup(struct dev_dax *dev_dax,
+		struct kmem_persist_ops *pops, void *persist_data)
+{
+	int rc;
+
+	/* NULL when kmem_persist_probe() failed; assumes dax_kmem_data is zeroed. */
+	if (!pops)
+		return 0;
+
+	rc = pops->cleanup(dev_dax, persist_data);
+	if (rc) {
+		dev_err(&dev_dax->dev,
+			"Error cleaning up kmem persistence type %d\n", pops->type);
+		return rc;
+	}
+	mutex_lock(&persist_types_lock);
+	pops->ref_count--;
+	mutex_unlock(&persist_types_lock);
+	return 0;
+}
+
+#endif /* CONFIG_DEV_DAX_KMEM_PERSIST */
+
 static struct dax_device_driver device_dax_kmem_driver = {
 	.probe = dev_dax_kmem_probe,
 	.remove = dev_dax_kmem_remove,
diff --git a/drivers/dax/kmem_persist.h b/drivers/dax/kmem_persist.h
new file mode 100644
index 000000000000..dd651025f28c
--- /dev/null
+++ b/drivers/dax/kmem_persist.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright(c) 2022 MemVerge. All rights reserved.
+ */
+#ifndef __KMEM_PERSIST_H__
+#define __KMEM_PERSIST_H__
+
+struct page;
+struct dev_dax;
+
+enum kmem_persist_type {	/* format id; kmem.c matches it against ops->type */
+	KMEM_PERSIST_NONE = 0,
+};
+
+
+struct kmem_persist_ops {
+	enum kmem_persist_type type;	/* format id this plugin handles */
+	int (*format)(struct dev_dax *dev_dax);	/* must leave a valid superblock */
+	int (*probe)(struct dev_dax *dev_dax, void **data);	/* fills *data on success */
+	int (*cleanup)(struct dev_dax *dev_dax, void *data);	/* called at device remove */
+	int ref_count;	/* devices using this plugin; changed under persist_types_lock */
+	struct list_head next;	/* entry in kmem.c's persist_types list */
+};
+
+static const u64 kmem_persist_magic = 0x4b4d454d50455253ULL; /* "KMEMPERS" */
+
+struct kmem_persist_superblock {
+	u64 magic;	/* fixed width: this value is read back from the device */
+	enum kmem_persist_type type;	/* selects the plugin in kmem_persist_probe() */
+} __packed;
+
+int kmem_persist_type_register(struct kmem_persist_ops *ops);
+
+int kmem_persist_type_unregister(struct kmem_persist_ops *ops);
+
+/* Helpers for plugins: page lookup by device page index, and allocation. */
+struct page *dax_kmem_index_to_page(unsigned long page_index,
+				struct dev_dax *dev_dax);
+unsigned long dax_kmem_num_pages(struct dev_dax *dev_dax);
+struct page *dax_kmem_alloc_page(struct dev_dax *dev_dax,
+				unsigned long *page_index);
+
+#endif
-- 
2.30.2





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux