Add the actual driver which will own the DAX range. This allows very nice parity with the other possible "owners" of a DAX region: device DAX and filesystem DAX. It also greatly simplifies the process of handing off control of the memory between the different owners since it's just a matter of unbinding and rebinding the device to different drivers. I tried to do this all internally to the kernel and the locking and "self-destruction" of the old device context was a nightmare. Having userspace drive it is a wonderful simplification. Cc: Dan Williams <dan.j.williams@xxxxxxxxx> Cc: Dave Jiang <dave.jiang@xxxxxxxxx> Cc: Ross Zwisler <zwisler@xxxxxxxxxx> Cc: Vishal Verma <vishal.l.verma@xxxxxxxxx> Cc: Tom Lendacky <thomas.lendacky@xxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: linux-nvdimm@xxxxxxxxxxxx Cc: linux-kernel@xxxxxxxxxxxxxxx Cc: linux-mm@xxxxxxxxx Cc: Huang Ying <ying.huang@xxxxxxxxx> Cc: Fengguang Wu <fengguang.wu@xxxxxxxxx> --- b/drivers/dax/kmem.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff -puN /dev/null drivers/dax/kmem.c --- /dev/null 2018-09-18 12:39:53.059362935 -0700 +++ b/drivers/dax/kmem.c 2018-10-22 13:12:21.502930393 -0700 @@ -0,0 +1,152 @@ +// this is just a copy of drivers/dax/pmem.c with +// s/dax_pmem/dax_kmem/ for now. +// +// need real license +/* + * Copyright(c) 2016-2018 Intel Corporation. All rights reserved. 
+ */ +#include <linux/percpu-refcount.h> +#include <linux/memremap.h> +#include <linux/module.h> +#include <linux/pfn_t.h> +#include "../nvdimm/pfn.h" +#include "../nvdimm/nd.h" +#include "device-dax.h" + +struct dax_kmem { + struct device *dev; + struct percpu_ref ref; + struct dev_pagemap pgmap; + struct completion cmp; +}; + +static struct dax_kmem *to_dax_kmem(struct percpu_ref *ref) +{ + return container_of(ref, struct dax_kmem, ref); +} + +static void dax_kmem_percpu_release(struct percpu_ref *ref) +{ + struct dax_kmem *dax_kmem = to_dax_pmem(ref); + + dev_dbg(dax_kmem->dev, "trace\n"); + complete(&dax_kmem->cmp); +} + +static void dax_kmem_percpu_exit(void *data) +{ + struct percpu_ref *ref = data; + struct dax_kmem *dax_kmem = to_dax_pmem(ref); + + dev_dbg(dax_kmem->dev, "trace\n"); + wait_for_completion(&dax_kmem->cmp); + percpu_ref_exit(ref); +} + +static void dax_kmem_percpu_kill(void *data) +{ + struct percpu_ref *ref = data; + struct dax_kmem *dax_kmem = to_dax_pmem(ref); + + dev_dbg(dax_kmem->dev, "trace\n"); + percpu_ref_kill(ref); +} + +static int dax_kmem_probe(struct device *dev) +{ + void *addr; + struct resource res; + int rc, id, region_id; + struct nd_pfn_sb *pfn_sb; + struct dev_dax *dev_dax; + struct dax_kmem *dax_kmem; + struct nd_namespace_io *nsio; + struct dax_region *dax_region; + struct nd_namespace_common *ndns; + struct nd_dax *nd_dax = to_nd_dax(dev); + struct nd_pfn *nd_pfn = &nd_dax->nd_pfn; + + ndns = nvdimm_namespace_common_probe(dev); + if (IS_ERR(ndns)) + return PTR_ERR(ndns); + nsio = to_nd_namespace_io(&ndns->dev); + + dax_kmem = devm_kzalloc(dev, sizeof(*dax_kmem), GFP_KERNEL); + if (!dax_kmem) + return -ENOMEM; + + /* parse the 'pfn' info block via ->rw_bytes */ + rc = devm_nsio_enable(dev, nsio); + if (rc) + return rc; + rc = nvdimm_setup_pfn(nd_pfn, &dax_kmem->pgmap); + if (rc) + return rc; + devm_nsio_disable(dev, nsio); + + pfn_sb = nd_pfn->pfn_sb; + + if (!devm_request_mem_region(dev, nsio->res.start, + 
resource_size(&nsio->res), + dev_name(&ndns->dev))) { + dev_warn(dev, "could not reserve region %pR\n", &nsio->res); + return -EBUSY; + } + + dax_kmem->dev = dev; + init_completion(&dax_kmem->cmp); + rc = percpu_ref_init(&dax_kmem->ref, dax_kmem_percpu_release, 0, + GFP_KERNEL); + if (rc) + return rc; + + rc = devm_add_action_or_reset(dev, dax_kmem_percpu_exit, + &dax_kmem->ref); + if (rc) + return rc; + + dax_kmem->pgmap.ref = &dax_kmem->ref; + addr = devm_memremap_pages(dev, &dax_kmem->pgmap); + if (IS_ERR(addr)) + return PTR_ERR(addr); + + rc = devm_add_action_or_reset(dev, dax_kmem_percpu_kill, + &dax_kmem->ref); + if (rc) + return rc; + + /* adjust the dax_region resource to the start of data */ + memcpy(&res, &dax_kmem->pgmap.res, sizeof(res)); + res.start += le64_to_cpu(pfn_sb->dataoff); + + rc = sscanf(dev_name(&ndns->dev), "namespace%d.%d", ®ion_id, &id); + if (rc != 2) + return -EINVAL; + + dax_region = alloc_dax_region(dev, region_id, &res, + le32_to_cpu(pfn_sb->align), addr, PFN_DEV|PFN_MAP); + if (!dax_region) + return -ENOMEM; + + /* TODO: support for subdividing a dax region... */ + dev_dax = devm_create_dev_dax(dax_region, id, &res, 1); + + /* child dev_dax instances now own the lifetime of the dax_region */ + dax_region_put(dax_region); + + return PTR_ERR_OR_ZERO(dev_dax); +} + +static struct nd_device_driver dax_kmem_driver = { + .probe = dax_kmem_probe, + .drv = { + .name = "dax_kmem", + }, + .type = ND_DRIVER_DAX_PMEM, +}; + +module_nd_driver(dax_kmem_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM); _