[PATCH v2 2/2] dax,pmem: Implement pmem based dax data recovery

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



For /dev/pmem based dax, enable DAX_OP_RECOVERY mode for
dax_direct_access to translate 'kaddr' over a range that
may contain poison(s); and enable dax_copy_to_iter to
read as much data as possible up till a poisoned page is
encountered; and enable dax_copy_from_iter to clear poison
among a page-aligned range, and then write the good data over.

Signed-off-by: Jane Chu <jane.chu@xxxxxxxxxx>
---
 drivers/md/dm.c       |  2 ++
 drivers/nvdimm/pmem.c | 75 ++++++++++++++++++++++++++++++++++++++++---
 fs/dax.c              | 24 +++++++++++---
 3 files changed, 92 insertions(+), 9 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dc354db22ef9..9b3dac916f22 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1043,6 +1043,7 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	if (!ti)
 		goto out;
 	if (!ti->type->dax_copy_from_iter) {
+		WARN_ON(mode == DAX_OP_RECOVERY);
 		ret = copy_from_iter(addr, bytes, i);
 		goto out;
 	}
@@ -1067,6 +1068,7 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	if (!ti)
 		goto out;
 	if (!ti->type->dax_copy_to_iter) {
+		WARN_ON(mode == DAX_OP_RECOVERY);
 		ret = copy_to_iter(addr, bytes, i);
 		goto out;
 	}
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 3dc99e0bf633..8ae6aa678c51 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
 
 	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
-					PFN_PHYS(nr_pages))))
+				 PFN_PHYS(nr_pages)) && mode == DAX_OP_NORMAL))
 		return -EIO;
 
 	if (kaddr)
@@ -303,20 +303,85 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
 }
 
 /*
- * Use the 'no check' versions of copy_from_iter_flushcache() and
- * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds
- * checking, both file offset and device offset, is handled by
- * dax_iomap_actor()
+ * Even though the 'no check' versions of copy_from_iter_flushcache()
+ * and copy_mc_to_iter() are used to bypass HARDENED_USERCOPY overhead,
+ * 'read'/'write' aren't always safe when poison is consumed. They happen
+ * to be safe because the 'read'/'write' range has been guaranteed
+ * be free of poison(s) by a prior call to dax_direct_access() on the
+ * caller stack.
+ * But on a data recovery code path, the 'read'/'write' range is expected
+ * to contain poison(s), and so poison(s) is explicit checked, such that
+ * 'read' can fetch data from clean page(s) up till the first poison is
+ * encountered, and 'write' requires the range be page aligned in order
+ * to restore the poisoned page's memory type back to "rw" after clearing
+ * the poison(s).
+ * In the event of poison related failure, (size_t) -EIO is returned and
+ * caller may check the return value after casting it to (ssize_t).
+ *
+ * TODO: add support for CPUs that support MOVDIR64B instruction for
+ * faster poison clearing, and possibly smaller error blast radius.
  */
 static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i, int mode)
 {
+	phys_addr_t pmem_off;
+	size_t len, lead_off;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct device *dev = pmem->bb.dev;
+
+	if (unlikely(mode == DAX_OP_RECOVERY)) {
+		lead_off = (unsigned long)addr & ~PAGE_MASK;
+		len = PFN_PHYS(PFN_UP(lead_off + bytes));
+		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
+			if (lead_off || !(PAGE_ALIGNED(bytes))) {
+				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
+					addr, bytes);
+				return (size_t) -EIO;
+			}
+			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
+						BLK_STS_OK)
+				return (size_t) -EIO;
+		}
+	}
+
 	return _copy_from_iter_flushcache(addr, bytes, i);
 }
 
 static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i, int mode)
 {
+	int num_bad;
+	size_t len, lead_off;
+	unsigned long bad_pfn;
+	bool bad_pmem = false;
+	size_t adj_len = bytes;
+	sector_t sector, first_bad;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct device *dev = pmem->bb.dev;
+
+	if (unlikely(mode == DAX_OP_RECOVERY)) {
+		sector = PFN_PHYS(pgoff) / 512;
+		lead_off = (unsigned long)addr & ~PAGE_MASK;
+		len = PFN_PHYS(PFN_UP(lead_off + bytes));
+		if (pmem->bb.count)
+			bad_pmem = !!badblocks_check(&pmem->bb, sector,
+					len / 512, &first_bad, &num_bad);
+		if (bad_pmem) {
+			bad_pfn = PHYS_PFN(first_bad * 512);
+			if (bad_pfn == pgoff) {
+				dev_warn(dev, "Found poison in page: pgoff(%#lx)\n",
+					pgoff);
+				return -EIO;
+			}
+			adj_len = PFN_PHYS(bad_pfn - pgoff) - lead_off;
+			dev_WARN_ONCE(dev, (adj_len > bytes),
+					"out-of-range first_bad?");
+		}
+		if (adj_len == 0)
+			return (size_t) -EIO;
+	}
+
 	return _copy_mc_to_iter(addr, bytes, i);
 }
 
diff --git a/fs/dax.c b/fs/dax.c
index bea6df1498c3..7640be6b6a97 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1219,6 +1219,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		const size_t size = ALIGN(length + offset, PAGE_SIZE);
 		const sector_t sector = dax_iomap_sector(iomap, pos);
+		long nr_page = PHYS_PFN(size);
+		int dax_mode = DAX_OP_NORMAL;
 		ssize_t map_len;
 		pgoff_t pgoff;
 		void *kaddr;
@@ -1232,8 +1234,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 		if (ret)
 			break;
 
-		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
-					    DAX_OP_NORMAL, &kaddr, NULL);
+		map_len = dax_direct_access(dax_dev, pgoff, nr_page, dax_mode,
+					    &kaddr, NULL);
+		if (unlikely(map_len == -EIO)) {
+			dax_mode = DAX_OP_RECOVERY;
+			map_len = dax_direct_access(dax_dev, pgoff, nr_page,
+						    dax_mode, &kaddr, NULL);
+		}
 		if (map_len < 0) {
 			ret = map_len;
 			break;
@@ -1252,11 +1259,20 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 		 */
 		if (iov_iter_rw(iter) == WRITE)
 			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
-					map_len, iter, DAX_OP_NORMAL);
+					map_len, iter, dax_mode);
 		else
 			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
-					map_len, iter, DAX_OP_NORMAL);
+					map_len, iter, dax_mode);
 
+		/*
+		 * If dax data recovery is enacted via DAX_OP_RECOVERY,
+		 * recovery failure would be indicated by a -EIO return
+		 * in 'xfer' casted as (size_t).
+		 */
+		if ((ssize_t)xfer == -EIO) {
+			ret = -EIO;
+			break;
+		}
 		pos += xfer;
 		length -= xfer;
 		done += xfer;
-- 
2.18.4




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux