Hi Linus,

Does this version look good to you? Would you please consider applying
it for -rc2? Or would you prefer to get it from Andrew?

Thanks,
Yang

On Tue, Nov 16, 2021 at 11:32 AM Yang Shi <shy828301@xxxxxxxxx> wrote:
>
> The current behavior of memory failure is to truncate the page cache
> regardless of whether the page is dirty or clean. If the page is dirty,
> a later access will get obsolete data from disk without any
> notification to the user. This may cause silent data loss. It is even
> worse for shmem, since shmem is an in-memory filesystem: truncating the
> page cache means discarding the data blocks, so a later read returns
> all zeroes.
>
> The right approach is to keep the corrupted page in the page cache; any
> later access returns an error for syscalls or SIGBUS for page faults,
> until the file is truncated, hole punched or removed. Regular
> storage-backed filesystems would be more complicated, so this patch
> focuses on shmem. This also unblocks support for soft offlining shmem
> THP.
>
> [arnd@xxxxxxxx: fix uninitialized variable use in me_pagecache_clean()]
> Link: https://lkml.kernel.org/r/20211022064748.4173718-1-arnd@xxxxxxxxxx
> [Fix invalid pointer dereference in shmem_read_mapping_page_gfp() with a
> slightly different implementation from what Ajay Garg <ajaygargnsit@xxxxxxxxx>
> and Muchun Song <songmuchun@xxxxxxxxxxxxx> proposed, and rework the
> error handling of shmem_write_begin() as suggested by Linus]
> Link: https://lore.kernel.org/linux-mm/20211111084617.6746-1-ajaygargnsit@xxxxxxxxx/
>
> Link: https://lkml.kernel.org/r/20211020210755.23964-6-shy828301@xxxxxxxxx
> Signed-off-by: Yang Shi <shy828301@xxxxxxxxx>
> Signed-off-by: Arnd Bergmann <arnd@xxxxxxxx>
> Cc: Hugh Dickins <hughd@xxxxxxxxxx>
> Cc: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
> Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> Cc: Naoya Horiguchi <naoya.horiguchi@xxxxxxx>
> Cc: Oscar Salvador <osalvador@xxxxxxx>
> Cc: Peter Xu <peterx@xxxxxxxxxx>
> Cc: Ajay Garg <ajaygargnsit@xxxxxxxxx>
> Cc: Muchun Song <songmuchun@xxxxxxxxxxxxx>
> ---
> v2: Incorporated the suggestions from Willy.
> v1: Fixed the pointer dereference oops reported by Ajay Garg and reworked
>     the error handling of shmem_write_begin() as suggested by Linus.
>
>  mm/memory-failure.c | 14 +++++++++++---
>  mm/shmem.c          | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
>  mm/userfaultfd.c    |  5 +++++
>  3 files changed, 61 insertions(+), 9 deletions(-)
>
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 07c875fdeaf0..f64ebb6226cb 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -58,6 +58,7 @@
>  #include <linux/ratelimit.h>
>  #include <linux/page-isolation.h>
>  #include <linux/pagewalk.h>
> +#include <linux/shmem_fs.h>
>  #include "internal.h"
>  #include "ras/ras_event.h"
>
> @@ -867,6 +868,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p)
>  {
>         int ret;
>         struct address_space *mapping;
> +       bool extra_pins;
>
>         delete_from_lru_cache(p);
>
> @@ -895,18 +897,24 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p)
>                 goto out;
>         }
>
> +       /*
> +        * The shmem page is kept in page cache instead of truncating
> +        * so is expected to have an extra refcount after error-handling.
> +        */
> +       extra_pins = shmem_mapping(mapping);
> +
>         /*
>          * Truncation is a bit tricky. Enable it per file system for now.
>          *
>          * Open: to take i_rwsem or not for this? Right now we don't.
>          */
>         ret = truncate_error_page(p, page_to_pfn(p), mapping);
> +       if (has_extra_refcount(ps, p, extra_pins))
> +               ret = MF_FAILED;
> +
>  out:
>         unlock_page(p);
>
> -       if (has_extra_refcount(ps, p, false))
> -               ret = MF_FAILED;
> -
>         return ret;
>  }
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index dc038ce78700..2ba64918132b 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -2456,6 +2456,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
>         struct inode *inode = mapping->host;
>         struct shmem_inode_info *info = SHMEM_I(inode);
>         pgoff_t index = pos >> PAGE_SHIFT;
> +       int ret = 0;
>
>         /* i_rwsem is held by caller */
>         if (unlikely(info->seals & (F_SEAL_GROW |
> @@ -2466,7 +2467,19 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
>                         return -EPERM;
>         }
>
> -       return shmem_getpage(inode, index, pagep, SGP_WRITE);
> +       ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
> +
> +       if (ret)
> +               return ret;
> +
> +       if (PageHWPoison(*pagep)) {
> +               unlock_page(*pagep);
> +               put_page(*pagep);
> +               *pagep = NULL;
> +               return -EIO;
> +       }
> +
> +       return 0;
>  }
>
>  static int
> @@ -2553,6 +2566,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>                         if (sgp == SGP_CACHE)
>                                 set_page_dirty(page);
>                         unlock_page(page);
> +
> +                       if (PageHWPoison(page)) {
> +                               put_page(page);
> +                               error = -EIO;
> +                               break;
> +                       }
>                 }
>
>                 /*
> @@ -3092,7 +3111,8 @@ static const char *shmem_get_link(struct dentry *dentry,
>                 page = find_get_page(inode->i_mapping, 0);
>                 if (!page)
>                         return ERR_PTR(-ECHILD);
> -               if (!PageUptodate(page)) {
> +               if (PageHWPoison(page) ||
> +                   !PageUptodate(page)) {
>                         put_page(page);
>                         return ERR_PTR(-ECHILD);
>                 }
> @@ -3100,6 +3120,13 @@ static const char *shmem_get_link(struct dentry *dentry,
>                 error = shmem_getpage(inode, 0, &page, SGP_READ);
>                 if (error)
>                         return ERR_PTR(error);
> +               if (!page)
> +                       return ERR_PTR(-ECHILD);
> +               if (PageHWPoison(page)) {
> +                       unlock_page(page);
> +                       put_page(page);
> +                       return ERR_PTR(-ECHILD);
> +               }
>                 unlock_page(page);
>         }
>         set_delayed_call(done, shmem_put_link, page);
> @@ -3750,6 +3777,13 @@ static void shmem_destroy_inodecache(void)
>         kmem_cache_destroy(shmem_inode_cachep);
>  }
>
> +/* Keep the page in page cache instead of truncating it */
> +static int shmem_error_remove_page(struct address_space *mapping,
> +                                  struct page *page)
> +{
> +       return 0;
> +}
> +
>  const struct address_space_operations shmem_aops = {
>         .writepage      = shmem_writepage,
>         .set_page_dirty = __set_page_dirty_no_writeback,
> @@ -3760,7 +3794,7 @@ const struct address_space_operations shmem_aops = {
>  #ifdef CONFIG_MIGRATION
>         .migratepage    = migrate_page,
>  #endif
> -       .error_remove_page = generic_error_remove_page,
> +       .error_remove_page = shmem_error_remove_page,
>  };
>  EXPORT_SYMBOL(shmem_aops);
>
> @@ -4168,9 +4202,14 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
>         error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
>                                   gfp, NULL, NULL, NULL);
>         if (error)
> -               page = ERR_PTR(error);
> -       else
> -               unlock_page(page);
> +               return ERR_PTR(error);
> +
> +       unlock_page(page);
> +       if (PageHWPoison(page)) {
> +               put_page(page);
> +               return ERR_PTR(-EIO);
> +       }
> +
>         return page;
>  #else
>         /*
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index ac6f036298cd..0780c2a57ff1 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -232,6 +232,11 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
>                 goto out;
>         }
>
> +       if (PageHWPoison(page)) {
> +               ret = -EIO;
> +               goto out_release;
> +       }
> +
>         ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
>                                        page, false, wp_copy);
>         if (ret)
> --
> 2.26.2
>
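
To make the user-visible effect above concrete, here is a small, hypothetical
demo program (not part of the patch). It assumes CAP_SYS_ADMIN and a kernel
built with CONFIG_MEMORY_FAILURE, and uses madvise(MADV_HWPOISON) on a memfd
(shmem) page to stand in for a real hardware error:

/*
 * Hypothetical demo, not part of the patch: poison a shmem page with
 * madvise(MADV_HWPOISON) (needs CAP_SYS_ADMIN and CONFIG_MEMORY_FAILURE)
 * and observe the syscall behavior described in the changelog.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char buf[16];

        int fd = memfd_create("hwpoison-demo", 0);     /* shmem-backed file */
        if (fd < 0 || ftruncate(fd, psz) < 0)
                return 1;
        pwrite(fd, "data", 4, 0);                      /* populate the first page */

        char *map = mmap(NULL, psz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
                return 1;
        if (madvise(map, psz, MADV_HWPOISON))          /* simulate a memory failure */
                perror("madvise(MADV_HWPOISON)");

        /*
         * With the patch the poisoned page stays in the page cache, so
         * read() is expected to fail with EIO instead of silently
         * returning zeroes.
         */
        if (pread(fd, buf, sizeof(buf), 0) < 0)
                printf("pread after poison: %s (EIO expected)\n", strerror(errno));

        /* Hole punch discards the poisoned page; a later read sees zeroes. */
        fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, psz);
        if (pread(fd, buf, sizeof(buf), 0) >= 0)
                printf("pread after hole punch: ok\n");

        return 0;
}

On a kernel without this series the same pread() would typically succeed and
return zeroes after the page cache truncation, which is exactly the silent
data loss described in the changelog.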