For /dev/pmem based dax, enable DAX_OP_RECOVERY mode for
dax_direct_access to translate 'kaddr' over a range that may
contain poison(s); enable dax_copy_to_iter to read as much data
as possible, up until a poisoned page is encountered; and enable
dax_copy_from_iter to clear poison within a page-aligned range,
and then write the good data over.

Signed-off-by: Jane Chu <jane.chu@xxxxxxxxxx>
---
 drivers/md/dm.c       |  2 ++
 drivers/nvdimm/pmem.c | 77 +++++++++++++++++++++++++++++++++++++++----
 fs/dax.c              | 24 +++++++++++---
 3 files changed, 93 insertions(+), 10 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dc354db22ef9..9b3dac916f22 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1043,6 +1043,7 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	if (!ti)
 		goto out;
 	if (!ti->type->dax_copy_from_iter) {
+		WARN_ON(mode == DAX_OP_RECOVERY);
 		ret = copy_from_iter(addr, bytes, i);
 		goto out;
 	}
@@ -1067,6 +1068,7 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	if (!ti)
 		goto out;
 	if (!ti->type->dax_copy_to_iter) {
+		WARN_ON(mode == DAX_OP_RECOVERY);
 		ret = copy_to_iter(addr, bytes, i);
 		goto out;
 	}
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 3dc99e0bf633..8ae6aa678c51 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
 
 	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
-					PFN_PHYS(nr_pages))))
+			PFN_PHYS(nr_pages)) && mode == DAX_OP_NORMAL))
 		return -EIO;
 
 	if (kaddr)
@@ -303,20 +303,85 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
 }
 
 /*
- * Use the 'no check' versions of copy_from_iter_flushcache() and
- * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds
- * checking, both file offset and device offset, is handled by
- * dax_iomap_actor()
+ * Even though the 'no check' versions of copy_from_iter_flushcache()
+ * and copy_mc_to_iter() are used to bypass HARDENED_USERCOPY overhead,
+ * 'read'/'write' aren't always safe when poison is consumed. They happen
+ * to be safe here because the 'read'/'write' range has been guaranteed
+ * to be free of poison(s) by a prior call to dax_direct_access() on the
+ * caller's stack.
+ * But on a data recovery code path, the 'read'/'write' range is expected
+ * to contain poison(s), so poison is explicitly checked, such that
+ * 'read' can fetch data from clean page(s) up until the first poison is
+ * encountered, and 'write' requires the range be page aligned in order
+ * to restore the poisoned page's memory type back to "rw" after clearing
+ * the poison(s).
+ * In the event of a poison related failure, (size_t) -EIO is returned
+ * and the caller may check the return value after casting it to (ssize_t).
+ *
+ * TODO: add support for CPUs that support the MOVDIR64B instruction for
+ * faster poison clearing, and possibly a smaller error blast radius.
  */
 static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i, int mode)
 {
+	phys_addr_t pmem_off;
+	size_t len, lead_off;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct device *dev = pmem->bb.dev;
+
+	if (unlikely(mode == DAX_OP_RECOVERY)) {
+		lead_off = (unsigned long)addr & ~PAGE_MASK;
+		len = PFN_PHYS(PFN_UP(lead_off + bytes));
+		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
+			if (lead_off || !PAGE_ALIGNED(bytes)) {
+				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#zx) not page aligned\n",
+					addr, bytes);
+				return (size_t) -EIO;
+			}
+			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
+						BLK_STS_OK)
+				return (size_t) -EIO;
+		}
+	}
+
 	return _copy_from_iter_flushcache(addr, bytes, i);
 }
 
 static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i, int mode)
 {
+	int num_bad;
+	size_t len, lead_off;
+	unsigned long bad_pfn;
+	bool bad_pmem = false;
+	size_t adj_len = bytes;
+	sector_t sector, first_bad;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct device *dev = pmem->bb.dev;
+
+	if (unlikely(mode == DAX_OP_RECOVERY)) {
+		sector = PFN_PHYS(pgoff) / 512;
+		lead_off = (unsigned long)addr & ~PAGE_MASK;
+		len = PFN_PHYS(PFN_UP(lead_off + bytes));
+		if (pmem->bb.count)
+			bad_pmem = !!badblocks_check(&pmem->bb, sector,
+					len / 512, &first_bad, &num_bad);
+		if (bad_pmem) {
+			bad_pfn = PHYS_PFN(first_bad * 512);
+			if (bad_pfn == pgoff) {
+				dev_warn(dev, "Found poison in page: pgoff(%#lx)\n",
+					pgoff);
+				return (size_t) -EIO;
+			}
+			adj_len = PFN_PHYS(bad_pfn - pgoff) - lead_off;
+			dev_WARN_ONCE(dev, (adj_len > bytes),
+					"out-of-range first_bad?");
+		}
+		if (adj_len == 0)
+			return (size_t) -EIO;
+	}
+
-	return _copy_mc_to_iter(addr, bytes, i);
+	return _copy_mc_to_iter(addr, adj_len, i);
 }
diff --git a/fs/dax.c b/fs/dax.c
index bea6df1498c3..7640be6b6a97 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1219,6 +1219,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		const size_t size = ALIGN(length + offset, PAGE_SIZE);
 		const sector_t sector = dax_iomap_sector(iomap, pos);
+		long nr_page = PHYS_PFN(size);
+		int dax_mode = DAX_OP_NORMAL;
 		ssize_t map_len;
 		pgoff_t pgoff;
 		void *kaddr;
@@ -1232,8 +1234,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 		if (ret)
 			break;
 
-		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
-				DAX_OP_NORMAL, &kaddr, NULL);
+		map_len = dax_direct_access(dax_dev, pgoff, nr_page, dax_mode,
+				&kaddr, NULL);
+		if (unlikely(map_len == -EIO)) {
+			dax_mode = DAX_OP_RECOVERY;
+			map_len = dax_direct_access(dax_dev, pgoff, nr_page,
+					dax_mode, &kaddr, NULL);
+		}
 		if (map_len < 0) {
 			ret = map_len;
 			break;
@@ -1252,11 +1259,20 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 		 */
 		if (iov_iter_rw(iter) == WRITE)
 			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
-					map_len, iter, DAX_OP_NORMAL);
+					map_len, iter, dax_mode);
 		else
 			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
-					map_len, iter, DAX_OP_NORMAL);
+					map_len, iter, dax_mode);
+		/*
+		 * If dax data recovery is enacted via DAX_OP_RECOVERY,
+		 * recovery failure is indicated by a (size_t) -EIO return
+		 * in 'xfer', detected by casting 'xfer' to (ssize_t).
+		 */
+		if ((ssize_t)xfer == -EIO) {
+			ret = -EIO;
+			break;
+		}
 
 		pos += xfer;
 		length -= xfer;
 		done += xfer;
-- 
2.18.4
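
As a review aid, here is a standalone userspace sketch of the range
arithmetic that the DAX_OP_RECOVERY branch of pmem_copy_to_iter() performs
to trim a recovery read so it stops at the first bad page. This is not part
of the patch: the PAGE_*/PFN_* macros are redefined locally to mimic their
kernel counterparts, and the addr/pgoff/bad_pfn values are made up.

#include <stdio.h>

/* Local stand-ins for the kernel constants/helpers used by the patch. */
#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_PHYS(x)	((unsigned long)(x) << PAGE_SHIFT)

int main(void)
{
	unsigned long addr = 0x1000200;	/* kaddr, not page aligned */
	unsigned long pgoff = 0x1000;	/* page index backing addr */
	unsigned long bad_pfn = 0x1002;	/* first poisoned page index */
	size_t bytes = 3 * PAGE_SIZE;	/* requested read length */

	/* Same computation as the DAX_OP_RECOVERY branch: */
	size_t lead_off = addr & ~PAGE_MASK;	/* 0x200 into first page */
	size_t len = PFN_PHYS(PFN_UP(lead_off + bytes)); /* poison-check span */
	size_t adj_len = PFN_PHYS(bad_pfn - pgoff) - lead_off;

	/* prints: poison-check 16384 bytes, copy only 7680 of 12288 */
	printf("poison-check %zu bytes, copy only %zu of %zu\n",
	       len, adj_len, bytes);
	return 0;
}

The poison check deliberately covers whole pages (len), while the copy is
clipped (adj_len) so that no poisoned cacheline is ever consumed.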
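Similarly, the (size_t) -EIO convention documented in the pmem.c comment
above can be demonstrated in userspace. mock_copy_to_iter() below is a
made-up stand-in for the real copy op, not a kernel function:

#include <errno.h>
#include <stdio.h>
#include <sys/types.h>

/* Mock of a dax copy op: bytes copied on success, (size_t) -EIO on poison. */
static size_t mock_copy_to_iter(size_t bytes, int poisoned)
{
	if (poisoned)
		return (size_t) -EIO;	/* wraps to a huge unsigned value */
	return bytes;
}

int main(void)
{
	size_t xfer = mock_copy_to_iter(4096, 1);

	/* Caller-side check, as dax_iomap_iter() does after the copy. */
	if ((ssize_t)xfer == -EIO)
		printf("recovery failed: -EIO recovered via (ssize_t) cast\n");
	else
		printf("copied %zu bytes\n", xfer);
	return 0;
}

Since a copy can never legitimately transfer anywhere near (size_t) -EIO
bytes, encoding the errno in the top of the unsigned range is unambiguous,
the same trick the kernel relies on for IS_ERR_VALUE().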