Mathieu Desnoyers wrote: [..] > The change for -EOPNOTSUPP in header and implementation would look like > this (more questions below): > > diff --git a/include/linux/dax.h b/include/linux/dax.h > index b463502b16e1..df2d52b8a245 100644 > --- a/include/linux/dax.h > +++ b/include/linux/dax.h > @@ -86,11 +86,7 @@ static inline void *dax_holder(struct dax_device *dax_dev) > static inline struct dax_device *alloc_dax(void *private, > const struct dax_operations *ops) > { > - /* > - * Callers should check IS_ENABLED(CONFIG_DAX) to know if this > - * NULL is an error or expected. > - */ > - return NULL; > + return ERR_PTR(-EOPNOTSUPP); > } > static inline void put_dax(struct dax_device *dax_dev) > { > diff --git a/drivers/dax/super.c b/drivers/dax/super.c > index 8c3a6e8e6334..c1cf6f0bbe12 100644 > --- a/drivers/dax/super.c > +++ b/drivers/dax/super.c > @@ -448,7 +448,7 @@ struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) > > /* Unavailable on architectures with virtually aliased data caches. */ > if (cpu_dcache_is_aliasing()) > - return NULL; > + return ERR_PTR(-EOPNOTSUPP); > > if (WARN_ON_ONCE(ops && !ops->zero_page_range)) > return ERR_PTR(-EINVAL); > > > > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c > > index f4b635526345..254d3b1e420e 100644 > > --- a/drivers/dax/super.c > > +++ b/drivers/dax/super.c > > @@ -322,7 +322,7 @@ EXPORT_SYMBOL_GPL(dax_alive); > > */ > > void kill_dax(struct dax_device *dax_dev) > > { > > - if (!dax_dev) > > + if (IS_ERR_OR_NULL(dax_dev)) > > I am tempted to just leave the "if (!dax_dev)" check here, because > many other functions of this API are only protected by a NULL > pointer check. I would hate to forget to convert one check in this > change, and I don't think it simplifies anything. It's not that it simplifies anything, but mistakes fail safely as a memory leak rather than a crash. > The alternative route I intend to take is to audit all callers > of alloc_dax() and make sure they all save the alloc_dax() return > value in a struct dax_device * local variable first for the sake > of checking for IS_ERR(). This will leave the xyz->dax_dev pointer > initialized to NULL in the error case and simplify the rest of > error checking. I could maybe get on board with that, but it needs a comment somewhere about the asymmetric subtlety. > > > > return; > > > > if (dax_dev->holder_data != NULL) > > diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c > > index 4e8fdcb3f1c8..b69c9e442cf4 100644 > > --- a/drivers/nvdimm/pmem.c > > +++ b/drivers/nvdimm/pmem.c > > @@ -560,17 +560,19 @@ static int pmem_attach_disk(struct device *dev, > > dax_dev = alloc_dax(pmem, &pmem_dax_ops); > > if (IS_ERR(dax_dev)) { > > rc = PTR_ERR(dax_dev); > > - goto out; > > + if (rc != -EOPNOTSUPP) > > + goto out; > > If I compare the before / after this change, if previously > pmem_attach_disk() was called in a configuration with FS_DAX=n, it would > result in a NULL pointer dereference. No, alloc_dax() only returns NULL CONFIG_DAX=n case, not the CONFIG_FS_DAX=n case. So that means that pmem devices on ARM have been possible without FS_DAX. So, in order for alloc_dax() returning ERR_PTR(-EOPNOTSUPP) to not regress pmem device availability this error path needs to be changed. > This would be an error handling fix all by itself. Do we really want > to return successfully if dax is unsupported, or should we return > an error here ? Per above, there is no error handling fix, and pmem block device available should not depend on alloc_dax() succeeding. The real question is what to do about device-dax. I *think* it is not affected by cpu_dcache aliasing because it never accesses user mappings through a kernel alias. I doubt device-dax is in use on these platforms, but we might need another fixup for that if someone screams about the alloc_dax() behavior change making them lose device-dax access. > > + } else { > > + set_dax_nocache(dax_dev); > > + set_dax_nomc(dax_dev); > > + if (is_nvdimm_sync(nd_region)) > > + set_dax_synchronous(dax_dev); > > + rc = dax_add_host(dax_dev, disk); > > + if (rc) > > + goto out_cleanup_dax; > > + dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); > > + pmem->dax_dev = dax_dev; > > } > > - set_dax_nocache(dax_dev); > > - set_dax_nomc(dax_dev); > > - if (is_nvdimm_sync(nd_region)) > > - set_dax_synchronous(dax_dev); > > - rc = dax_add_host(dax_dev, disk); > > - if (rc) > > - goto out_cleanup_dax; > > - dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); > > - pmem->dax_dev = dax_dev; > > > > rc = device_add_disk(dev, disk, pmem_attribute_groups); > > if (rc) > > diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c > > index 4b7ecd4fd431..f911e58a24dd 100644 > > --- a/drivers/s390/block/dcssblk.c > > +++ b/drivers/s390/block/dcssblk.c > > @@ -681,12 +681,14 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char > > if (IS_ERR(dev_info->dax_dev)) { > > rc = PTR_ERR(dev_info->dax_dev); > > dev_info->dax_dev = NULL; > > - goto put_dev; > > + if (rc != -EOPNOTSUPP) > > + goto put_dev; > > config DCSSBLK selects FS_DAX_LIMITED and DAX. > > I'm not sure what selecting DAX is trying to achieve here, because the > Kconfig option is "FS_DAX". > > So depending on the real motivation behind this select, we may want to > consider failure rather than success in the -EOPNOTSUPP case. Ah, true, s390 is a !cpu_dcache_is_aliasing() arch, so it is ok to fail driver load on alloc_dax() failure as it knows that ERR_PTR(-EOPNOTSUPP) will never be returned. > > > + } else { > > + set_dax_synchronous(dev_info->dax_dev); > > + rc = dax_add_host(dev_info->dax_dev, dev_info->gd); > > + if (rc) > > + goto out_dax; > > } > > - set_dax_synchronous(dev_info->dax_dev); > > - rc = dax_add_host(dev_info->dax_dev, dev_info->gd); > > - if (rc) > > - goto out_dax; > > > > get_device(&dev_info->dev); > > rc = device_add_disk(&dev_info->dev, dev_info->gd, NULL); > > My own changes, if we want failure on -EOPNOTSUPP: > > diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c > index 4b7ecd4fd431..f363c1d51d9a 100644 > --- a/drivers/s390/block/dcssblk.c > +++ b/drivers/s390/block/dcssblk.c > @@ -549,6 +549,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char > int rc, i, j, num_of_segments; > struct dcssblk_dev_info *dev_info; > struct segment_info *seg_info, *temp; > + struct dax_device *dax_dev; > char *local_buf; > unsigned long seg_byte_size; > > @@ -677,13 +678,13 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char > if (rc) > goto put_dev; > > - dev_info->dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops); > - if (IS_ERR(dev_info->dax_dev)) { > - rc = PTR_ERR(dev_info->dax_dev); > - dev_info->dax_dev = NULL; > + dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops); > + if (IS_ERR(dax_dev)) { > + rc = PTR_ERR(dax_dev); > goto put_dev; > } > - set_dax_synchronous(dev_info->dax_dev); > + set_dax_synchronous(dax_dev); > + dev_info->dax_dev = dax_dev; Looks good to me. > rc = dax_add_host(dev_info->dax_dev, dev_info->gd); > if (rc) > goto out_dax; > > > > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c > > index 5f1be1da92ce..11053a70f5ab 100644 > > --- a/fs/fuse/virtio_fs.c > > +++ b/fs/fuse/virtio_fs.c > > @@ -16,6 +16,7 @@ > > #include <linux/fs_context.h> > > #include <linux/fs_parser.h> > > #include <linux/highmem.h> > > +#include <linux/cleanup.h> > > #include <linux/uio.h> > > #include "fuse_i.h" > > > > @@ -795,8 +796,11 @@ static void virtio_fs_cleanup_dax(void *data) > > put_dax(dax_dev); > > } > > > > +DEFINE_FREE(cleanup_dax, struct dax_dev *, if (!IS_ERR_OR_NULL(_T)) virtio_fs_cleanup_dax(_T)) > > + > > static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) > > So either I'm completely missing how ownership works in this function, or > we should be really concerned about the fact that it does no actual > cleanup of anything on any error. > > I would be tempted to first refactor this function without using cleanup.h > so those fixes can be easily backported to older kernels (?) > > Here what I'm seeing so far: > > - devm_release_mem_region() is never called after devm_request_mem_region(). Not > on error, neither on teardown, devm_release_mem_region() is called from virtio_fs_probe() context. That means that when virtio_fs_probe() returns an error the driver core will automatically call devm_request_mem_region(). > - pgmap is never freed on error after devm_kzalloc. That is what the "devm_" in devm_kzalloc() does, free the memory on driver-probe failure, or after the driver remove callback is invoked. > > > { > > + struct dax_device *dax_dev __free(cleanup_dax) = NULL; > > struct virtio_shm_region cache_reg; > > struct dev_pagemap *pgmap; > > bool have_cache; > > @@ -804,6 +808,15 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) > > if (!IS_ENABLED(CONFIG_FUSE_DAX)) > > return 0; > > > > + dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); > > + if (IS_ERR(dax_dev)) { > > + int rc = PTR_ERR(dax_dev); > > + > > + if (rc == -EOPNOTSUPP) > > + return 0; > > + return rc; > > + } > > What is gained by moving this allocation here ? The gain is to fail early in virtio_fs_setup_dax() since the fundamental dependency of alloc_dax() success is not met. For example why let the setup progress to devm_memremap_pages() when alloc_dax() is going to return ERR_PTR(-EOPNOTSUPP). > > > + > > /* Get cache region */ > > have_cache = virtio_get_shm_region(vdev, &cache_reg, > > (u8)VIRTIO_FS_SHMCAP_ID_CACHE); > > @@ -849,10 +862,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) > > dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", > > __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); > > > > - fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); > > - if (IS_ERR(fs->dax_dev)) > > - return PTR_ERR(fs->dax_dev); > > - > > + fs->dax_dev = no_free_ptr(dax_dev); > > return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, > > fs->dax_dev); > > } > > In addition I have: > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > index f90743a94da9..86de91b35f4d 100644 > --- a/drivers/md/dm.c > +++ b/drivers/md/dm.c > @@ -2054,6 +2054,7 @@ static void cleanup_mapped_device(struct mapped_device *md) > static struct mapped_device *alloc_dev(int minor) > { > int r, numa_node_id = dm_get_numa_node(); > + struct dax_device *dax_dev; > struct mapped_device *md; > void *old_md; > > @@ -2122,16 +2123,13 @@ static struct mapped_device *alloc_dev(int minor) > md->disk->private_data = md; > sprintf(md->disk->disk_name, "dm-%d", minor); > > - if (IS_ENABLED(CONFIG_FS_DAX)) { > - md->dax_dev = alloc_dax(md, &dm_dax_ops); > - if (IS_ERR(md->dax_dev)) { > - md->dax_dev = NULL; > - } else { > - set_dax_nocache(md->dax_dev); > - set_dax_nomc(md->dax_dev); > - if (dax_add_host(md->dax_dev, md->disk)) > - goto bad; > - } > + dax_dev = alloc_dax(md, &dm_dax_ops); > + if (!IS_ERR(dax_dev)) { > + set_dax_nocache(dax_dev); > + set_dax_nomc(dax_dev); > + md->dax_dev = dax_dev; > + if (dax_add_host(dax_dev, md->disk)) > + goto bad; Looks good.