pmem_recovery_write() consists of clearing poison via DSM, clearing page HWPoison bit, re-state _PAGE_PRESENT bit, cacheflush, write, and finally clearing bad-block record. A competing pread thread is held off during recovery write by the presence of bad-block record. A competing recovery_write thread is serialized by a lock. Signed-off-by: Jane Chu <jane.chu@xxxxxxxxxx> --- drivers/nvdimm/pmem.c | 84 +++++++++++++++++++++++++++++++++++++++---- drivers/nvdimm/pmem.h | 1 + 2 files changed, 79 insertions(+), 6 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index a68e7d3ed27e..dd2db4905c85 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -69,6 +69,14 @@ static void hwpoison_clear(struct pmem_device *pmem, } } +static void pmem_clear_badblocks(struct pmem_device *pmem, sector_t sector, + long cleared_blks) +{ + badblocks_clear(&pmem->bb, sector, cleared_blks); + if (pmem->bb_state) + sysfs_notify_dirent(pmem->bb_state); +} + static blk_status_t pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, unsigned int len) { @@ -88,9 +96,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem, dev_dbg(dev, "%#llx clear %ld sector%s\n", (unsigned long long) sector, cleared, cleared > 1 ? "s" : ""); - badblocks_clear(&pmem->bb, sector, cleared); - if (pmem->bb_state) - sysfs_notify_dirent(pmem->bb_state); + pmem_clear_badblocks(pmem, sector, cleared); } arch_invalidate_pmem(pmem->virt_addr + offset, len); @@ -257,10 +263,15 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { + bool bad_pmem; + bool do_recovery = false; resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; - if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, - PFN_PHYS(nr_pages)))) + bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, + PFN_PHYS(nr_pages)); + if (bad_pmem && kaddr) + do_recovery = dax_recovery_started(pmem->dax_dev, kaddr); + if (bad_pmem && !do_recovery) return -EIO; if (kaddr) @@ -319,10 +330,70 @@ static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, return _copy_mc_to_iter(addr, bytes, i); } +/* + * The recovery write thread started out as a normal pwrite thread and + * when the filesystem was told about potential media error in the + * range, filesystem turns the normal pwrite to a dax_recovery_write. + * + * The recovery write consists of clearing poison via DSM, clearing page + * HWPoison bit, reenable page-wide read-write permission, flush the + * caches and finally write. A competing pread thread needs to be held + * off during the recovery process since data read back might not be valid. + * And that's achieved by placing the badblock records clearing after + * the completion of the recovery write. + * + * Any competing recovery write thread needs to be serialized, and this is + * done via pmem device level lock .recovery_lock. + */ static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { - return 0; + size_t rc, len, off; + phys_addr_t pmem_off; + struct pmem_device *pmem = dax_get_private(dax_dev); + struct device *dev = pmem->bb.dev; + sector_t sector; + long cleared, cleared_blk; + + mutex_lock(&pmem->recovery_lock); + + /* If no poison found in the range, go ahead with write */ + off = (unsigned long)addr & ~PAGE_MASK; + len = PFN_PHYS(PFN_UP(off + bytes)); + if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) { + rc = _copy_from_iter_flushcache(addr, bytes, i); + goto write_done; + } + + /* Not page-aligned range cannot be recovered */ + if (off || !(PAGE_ALIGNED(bytes))) { + dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n", + addr, bytes); + rc = (size_t) -EIO; + goto write_done; + } + + pmem_off = PFN_PHYS(pgoff) + pmem->data_offset; + sector = (pmem_off - pmem->data_offset) / 512; + cleared = nvdimm_clear_poison(dev, pmem->phys_addr + pmem_off, len); + cleared_blk = cleared / 512; + if (cleared_blk > 0) { + hwpoison_clear(pmem, pmem->phys_addr + pmem_off, cleared); + } else { + dev_warn(dev, "pmem_recovery_write: cleared_blk: %ld\n", + cleared_blk); + rc = (size_t) -EIO; + goto write_done; + } + arch_invalidate_pmem(pmem->virt_addr + pmem_off, bytes); + + rc = _copy_from_iter_flushcache(addr, bytes, i); + + pmem_clear_badblocks(pmem, sector, cleared_blk); + +write_done: + mutex_unlock(&pmem->recovery_lock); + return rc; } static const struct dax_operations pmem_dax_ops = { @@ -514,6 +585,7 @@ static int pmem_attach_disk(struct device *dev, goto out_cleanup_dax; dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); set_dax_recovery(dax_dev); + mutex_init(&pmem->recovery_lock); pmem->dax_dev = dax_dev; rc = device_add_disk(dev, disk, pmem_attribute_groups); diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index 59cfe13ea8a8..971bff7552d6 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -24,6 +24,7 @@ struct pmem_device { struct dax_device *dax_dev; struct gendisk *disk; struct dev_pagemap pgmap; + struct mutex recovery_lock; }; long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, -- 2.18.4