Hi, This is an RFC patch to provide a dax operation to zero a range of memory. It will also clear poison in the process. This is primarily a compile-tested patch. I don't have real hardware to test the poison logic. I am posting this to figure out whether this is the right direction or not. Motivation for this patch comes from Christoph's feedback that he would rather prefer a dax way to zero a range instead of relying on having to call blkdev_issue_zeroout() in __dax_zero_page_range(). https://lkml.org/lkml/2019/8/26/361 My motivation for this change is virtiofs DAX support. There we use DAX but we don't have a block device. So any dax code which assumes that there is always an associated block device is a problem. So this is more of a cleanup of one of the places where dax has this dependency on a block device, and if we add a dax operation for zeroing a range, it can help with not having to call blkdev_issue_zeroout() in the dax path. I have yet to take care of stacked block drivers (dm/md). The current poison clearing logic is primarily written with the assumption that I/O is sector aligned. With this new method, this assumption is broken and one can pass any range of memory to zero. I have fixed a few places in the existing logic to be able to handle an arbitrary start/end. I am not sure whether there are other dependencies which might need fixing or prohibit us from providing this method. Any feedback or comment is welcome. 
Thanks Vivek --- drivers/dax/super.c | 13 +++++++++ drivers/nvdimm/pmem.c | 67 ++++++++++++++++++++++++++++++++++++++++++-------- fs/dax.c | 39 ++++++++--------------------- include/linux/dax.h | 3 ++ 4 files changed, 85 insertions(+), 37 deletions(-) Index: rhvgoyal-linux/drivers/nvdimm/pmem.c =================================================================== --- rhvgoyal-linux.orig/drivers/nvdimm/pmem.c 2020-01-23 11:32:11.075139183 -0500 +++ rhvgoyal-linux/drivers/nvdimm/pmem.c 2020-01-23 11:32:28.660139183 -0500 @@ -52,8 +52,8 @@ static void hwpoison_clear(struct pmem_d if (is_vmalloc_addr(pmem->virt_addr)) return; - pfn_start = PHYS_PFN(phys); - pfn_end = pfn_start + PHYS_PFN(len); + pfn_start = PFN_UP(phys); + pfn_end = PFN_DOWN(phys + len); for (pfn = pfn_start; pfn < pfn_end; pfn++) { struct page *page = pfn_to_page(pfn); @@ -71,22 +71,24 @@ static blk_status_t pmem_clear_poison(st phys_addr_t offset, unsigned int len) { struct device *dev = to_dev(pmem); - sector_t sector; + sector_t sector_start, sector_end; long cleared; blk_status_t rc = BLK_STS_OK; + int nr_sectors; - sector = (offset - pmem->data_offset) / 512; + sector_start = ALIGN((offset - pmem->data_offset), 512) / 512; + sector_end = ALIGN_DOWN((offset - pmem->data_offset + len), 512)/512; + nr_sectors = sector_end - sector_start; cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len); if (cleared < len) rc = BLK_STS_IOERR; - if (cleared > 0 && cleared / 512) { + if (cleared > 0 && nr_sectors > 0) { hwpoison_clear(pmem, pmem->phys_addr + offset, cleared); - cleared /= 512; - dev_dbg(dev, "%#llx clear %ld sector%s\n", - (unsigned long long) sector, cleared, - cleared > 1 ? "s" : ""); - badblocks_clear(&pmem->bb, sector, cleared); + dev_dbg(dev, "%#llx clear %d sector%s\n", + (unsigned long long) sector_start, nr_sectors, + nr_sectors > 1 ? 
"s" : ""); + badblocks_clear(&pmem->bb, sector_start, nr_sectors); if (pmem->bb_state) sysfs_notify_dirent(pmem->bb_state); } @@ -268,6 +270,50 @@ static const struct block_device_operati .revalidate_disk = nvdimm_revalidate_disk, }; +static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, + unsigned int offset, loff_t len) +{ + int rc = 0; + phys_addr_t phys_pos = pgoff * PAGE_SIZE + offset; + struct pmem_device *pmem = dax_get_private(dax_dev); + struct page *page = ZERO_PAGE(0); + + do { + unsigned bytes, nr_sectors = 0; + sector_t sector_start, sector_end; + bool bad_pmem = false; + phys_addr_t pmem_off = phys_pos + pmem->data_offset; + void *pmem_addr = pmem->virt_addr + pmem_off; + unsigned int page_offset; + + page_offset = offset_in_page(phys_pos); + bytes = min_t(loff_t, PAGE_SIZE - page_offset, len); + + sector_start = ALIGN(phys_pos, 512)/512; + sector_end = ALIGN_DOWN(phys_pos + bytes, 512)/512; + if (sector_end > sector_start) + nr_sectors = sector_end - sector_start; + + if (nr_sectors && + unlikely(is_bad_pmem(&pmem->bb, sector_start, + nr_sectors * 512))) + bad_pmem = true; + + write_pmem(pmem_addr, page, 0, bytes); + if (unlikely(bad_pmem)) { + rc = pmem_clear_poison(pmem, pmem_off, bytes); + write_pmem(pmem_addr, page, 0, bytes); + } + if (rc > 0) + return -EIO; + + phys_pos += phys_pos + bytes; + len -= bytes; + } while (len > 0); + + return 0; +} + static long pmem_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { @@ -299,6 +345,7 @@ static const struct dax_operations pmem_ .dax_supported = generic_fsdax_supported, .copy_from_iter = pmem_copy_from_iter, .copy_to_iter = pmem_copy_to_iter, + .zero_page_range = pmem_dax_zero_page_range, }; static const struct attribute_group *pmem_attribute_groups[] = { Index: rhvgoyal-linux/include/linux/dax.h =================================================================== --- rhvgoyal-linux.orig/include/linux/dax.h 2020-01-23 
11:25:23.814139183 -0500 +++ rhvgoyal-linux/include/linux/dax.h 2020-01-23 11:32:17.799139183 -0500 @@ -34,6 +34,8 @@ struct dax_operations { /* copy_to_iter: required operation for fs-dax direct-i/o */ size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t, struct iov_iter *); + /* zero_page_range: optional operation for fs-dax direct-i/o */ + int (*zero_page_range)(struct dax_device *, pgoff_t, unsigned, loff_t); }; extern struct attribute_group dax_attribute_group; @@ -209,6 +211,7 @@ size_t dax_copy_from_iter(struct dax_dev size_t bytes, struct iov_iter *i); size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); +int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, unsigned offset, loff_t len); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, Index: rhvgoyal-linux/fs/dax.c =================================================================== --- rhvgoyal-linux.orig/fs/dax.c 2020-01-23 11:25:23.814139183 -0500 +++ rhvgoyal-linux/fs/dax.c 2020-01-23 11:32:17.801139183 -0500 @@ -1044,38 +1044,23 @@ static vm_fault_t dax_load_hole(struct x return ret; } -static bool dax_range_is_aligned(struct block_device *bdev, - unsigned int offset, unsigned int length) -{ - unsigned short sector_size = bdev_logical_block_size(bdev); - - if (!IS_ALIGNED(offset, sector_size)) - return false; - if (!IS_ALIGNED(length, sector_size)) - return false; - - return true; -} - int __dax_zero_page_range(struct block_device *bdev, struct dax_device *dax_dev, sector_t sector, unsigned int offset, unsigned int size) { - if (dax_range_is_aligned(bdev, offset, size)) { - sector_t start_sector = sector + (offset >> 9); + pgoff_t pgoff; + long rc, id; - return blkdev_issue_zeroout(bdev, start_sector, - size >> 9, GFP_NOFS, 0); - } else { - pgoff_t pgoff; - long rc, id; + rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); + if 
(rc) + return rc; + + id = dax_read_lock(); + rc = dax_zero_page_range(dax_dev, pgoff, offset, size); + if (rc == -EOPNOTSUPP) { void *kaddr; - rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); - if (rc) - return rc; - - id = dax_read_lock(); + /* If driver does not implement zero page range, fallback */ rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); if (rc < 0) { dax_read_unlock(id); @@ -1083,9 +1068,9 @@ int __dax_zero_page_range(struct block_d } memset(kaddr + offset, 0, size); dax_flush(dax_dev, kaddr + offset, size); - dax_read_unlock(id); } - return 0; + dax_read_unlock(id); + return rc < 0 ? rc : 0; } EXPORT_SYMBOL_GPL(__dax_zero_page_range); Index: rhvgoyal-linux/drivers/dax/super.c =================================================================== --- rhvgoyal-linux.orig/drivers/dax/super.c 2020-01-23 11:25:23.814139183 -0500 +++ rhvgoyal-linux/drivers/dax/super.c 2020-01-23 11:32:17.802139183 -0500 @@ -344,6 +344,19 @@ size_t dax_copy_to_iter(struct dax_devic } EXPORT_SYMBOL_GPL(dax_copy_to_iter); +int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, + unsigned offset, loff_t len) +{ + if (!dax_alive(dax_dev)) + return -ENXIO; + + if (!dax_dev->ops->zero_page_range) + return -EOPNOTSUPP; + + return dax_dev->ops->zero_page_range(dax_dev, pgoff, offset, len); +} +EXPORT_SYMBOL_GPL(dax_zero_page_range); + #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)