On 11/5/2021 7:04 PM, Darrick J. Wong wrote:

<snip>

>> @@ -303,20 +303,85 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
>>  }
>>
>>  /*
>> - * Use the 'no check' versions of copy_from_iter_flushcache() and
>> - * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds
>> - * checking, both file offset and device offset, is handled by
>> - * dax_iomap_actor()
>> + * Even though the 'no check' versions of copy_from_iter_flushcache()
>> + * and copy_mc_to_iter() are used to bypass HARDENED_USERCOPY overhead,
>> + * 'read'/'write' aren't always safe when poison is consumed. They happen
>> + * to be safe because the 'read'/'write' range has been guaranteed to be
>> + * free of poison(s) by a prior call to dax_direct_access() on the
>> + * caller's stack.
>> + * But on a data recovery code path, the 'read'/'write' range is expected
>> + * to contain poison(s), so poison(s) are explicitly checked, such that
>> + * 'read' can fetch data from clean page(s) until the first poison is
>> + * encountered, and 'write' requires the range be page aligned in order
>> + * to restore the poisoned page's memory type back to "rw" after clearing
>> + * the poison(s).
>> + * In the event of a poison-related failure, (size_t) -EIO is returned and
>> + * the caller may check the return value after casting it to (ssize_t).
>> + *
>> + * TODO: add support for CPUs that support the MOVDIR64B instruction, for
>> + * faster poison clearing and a possibly smaller error blast radius.
>
> I get that it's still early days yet for whatever pmem stuff is going on
> for 5.17, but I feel like this ought to be a separate function called by
> pmem_copy_from_iter, with this huge comment attached to that recovery
> function.

Thanks, will refactor both functions; a rough sketch follows at the end
of this reply, below the badblocks discussion.

>
>>  */
>>  static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>>  		void *addr, size_t bytes, struct iov_iter *i, int mode)
>>  {
>> +	phys_addr_t pmem_off;
>> +	size_t len, lead_off;
>> +	struct pmem_device *pmem = dax_get_private(dax_dev);
>> +	struct device *dev = pmem->bb.dev;
>> +
>> +	if (unlikely(mode == DAX_OP_RECOVERY)) {
>> +		lead_off = (unsigned long)addr & ~PAGE_MASK;
>> +		len = PFN_PHYS(PFN_UP(lead_off + bytes));
>> +		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
>> +			if (lead_off || !(PAGE_ALIGNED(bytes))) {
>> +				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
>> +					addr, bytes);
>> +				return (size_t) -EIO;
>> +			}
>> +			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
>> +			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
>> +					BLK_STS_OK)
>> +				return (size_t) -EIO;
>
> Looks reasonable enough to me, though you might want to restructure this
> to reduce the amount of indent.

Agreed.

> FWIW I dislike how is_bad_pmem mixes units (sector_t vs. bytes), that
> was seriously confusing. But I guess that's a weird quirk of the
> badblocks API and .... ugh.
>
> (I dunno, can we at least clean up the nvdimm parts and some day replace
> the badblocks backend with something that can handle more than 16
> records? interval_tree is more than up to that task, I know, I use it
> for xfs online fsck...)

Let me look into this and get back to you.
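As for the refactoring, roughly along these lines, perhaps (an untested
sketch; pmem_recovery_write_prep() is just a placeholder name, and the
big comment above would move onto the helper):

static size_t pmem_recovery_write_prep(struct pmem_device *pmem,
		pgoff_t pgoff, void *addr, size_t bytes)
{
	struct device *dev = pmem->bb.dev;
	size_t lead_off = (unsigned long)addr & ~PAGE_MASK;
	size_t len = PFN_PHYS(PFN_UP(lead_off + bytes));
	phys_addr_t pmem_off;

	if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len))
		return 0;

	/* clearing poison is a page-granularity operation */
	if (lead_off || !PAGE_ALIGNED(bytes)) {
		dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
			 addr, bytes);
		return (size_t) -EIO;
	}

	pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
	if (pmem_clear_poison(pmem, pmem_off, bytes) != BLK_STS_OK)
		return (size_t) -EIO;

	return 0;
}

static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
		void *addr, size_t bytes, struct iov_iter *i, int mode)
{
	if (unlikely(mode == DAX_OP_RECOVERY)) {
		struct pmem_device *pmem = dax_get_private(dax_dev);
		size_t rc = pmem_recovery_write_prep(pmem, pgoff, addr, bytes);

		if (rc)
			return rc;
	}

	return _copy_from_iter_flushcache(addr, bytes, i);
}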
>
>> +		}
>> +	}
>> +
>>  	return _copy_from_iter_flushcache(addr, bytes, i);
>>  }
>>
>>  static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>>  		void *addr, size_t bytes, struct iov_iter *i, int mode)
>>  {
>> +	int num_bad;
>> +	size_t len, lead_off;
>> +	unsigned long bad_pfn;
>> +	bool bad_pmem = false;
>> +	size_t adj_len = bytes;
>> +	sector_t sector, first_bad;
>> +	struct pmem_device *pmem = dax_get_private(dax_dev);
>> +	struct device *dev = pmem->bb.dev;
>> +
>> +	if (unlikely(mode == DAX_OP_RECOVERY)) {
>> +		sector = PFN_PHYS(pgoff) / 512;
>> +		lead_off = (unsigned long)addr & ~PAGE_MASK;
>> +		len = PFN_PHYS(PFN_UP(lead_off + bytes));
>> +		if (pmem->bb.count)
>> +			bad_pmem = !!badblocks_check(&pmem->bb, sector,
>> +					len / 512, &first_bad, &num_bad);
>> +		if (bad_pmem) {
>> +			bad_pfn = PHYS_PFN(first_bad * 512);
>> +			if (bad_pfn == pgoff) {
>> +				dev_warn(dev, "Found poison in page: pgoff(%#lx)\n",
>> +					pgoff);
>> +				return -EIO;
>> +			}
>> +			adj_len = PFN_PHYS(bad_pfn - pgoff) - lead_off;
>> +			dev_WARN_ONCE(dev, (adj_len > bytes),
>> +				"out-of-range first_bad?");
>> +		}
>> +		if (adj_len == 0)
>> +			return (size_t) -EIO;
>
> Uh, are we supposed to adjust bytes here or something?

Because we're trying to read as much clean data as possible, up to the
first poisoned page... What is your concern here? (A sketch of the
intended short read is at the bottom of this mail.)

thanks!
-jane

>
> --D
>
>> +	}
>> +
>>  	return _copy_mc_to_iter(addr, bytes, i);
>>  }
>>
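To spell out the short-read intent (again an untested sketch, not the
posted patch, with the dev_warn()/dev_WARN_ONCE() diagnostics dropped
for brevity): the recovery branch would return early, copying only
adj_len bytes, i.e. the clean prefix ahead of the first poisoned page:

static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
		void *addr, size_t bytes, struct iov_iter *i, int mode)
{
	struct pmem_device *pmem = dax_get_private(dax_dev);
	size_t adj_len = bytes;

	if (unlikely(mode == DAX_OP_RECOVERY)) {
		sector_t sector = PFN_PHYS(pgoff) / 512;
		size_t lead_off = (unsigned long)addr & ~PAGE_MASK;
		size_t len = PFN_PHYS(PFN_UP(lead_off + bytes));
		sector_t first_bad;
		int num_bad;

		if (pmem->bb.count &&
		    badblocks_check(&pmem->bb, sector, len / 512,
				    &first_bad, &num_bad)) {
			unsigned long bad_pfn = PHYS_PFN(first_bad * 512);

			/* the first page of the range is poisoned: no data */
			if (bad_pfn == pgoff)
				return (size_t) -EIO;
			adj_len = PFN_PHYS(bad_pfn - pgoff) - lead_off;
		}
		if (adj_len == 0)
			return (size_t) -EIO;
		/* short read: hand back only the clean prefix */
		return _copy_mc_to_iter(addr, adj_len, i);
	}

	return _copy_mc_to_iter(addr, bytes, i);
}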