Patch "erofs: handle overlapped pclusters out of crafted images properly" has been added to the 6.10-stable tree

This is a note to let you know that I've just added the patch titled

    erofs: handle overlapped pclusters out of crafted images properly

to the 6.10-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     erofs-handle-overlapped-pclusters-out-of-crafted-ima.patch
and it can be found in the queue-6.10 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit b1eb5ec8e4f2b1239be2d9e728c19b9531d41687
Author: Gao Xiang <xiang@xxxxxxxxxx>
Date:   Tue Sep 10 15:08:47 2024 +0800

    erofs: handle overlapped pclusters out of crafted images properly
    
    [ Upstream commit 9e2f9d34dd12e6e5b244ec488bcebd0c2d566c50 ]
    
    syzbot reported a task hang issue due to a deadlock case where it is
    waiting for the folio lock of a cached folio that will be used for
    cache I/Os.
    
    After looking into the crafted fuzzed image, I found it's formed with
    several overlapped big pclusters as below:
    
     Ext:   logical offset   |  length :     physical offset    |  length
       0:        0..   16384 |   16384 :     151552..    167936 |   16384
       1:    16384..   32768 |   16384 :     155648..    172032 |   16384
       2:    32768..   49152 |   16384 :  537223168.. 537239552 |   16384
    ...
    
    Here, extents 0 and 1 physically overlap, although that is entirely
    _impossible_ for normal filesystem images generated by mkfs.
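
    As an illustration only (not part of the patch), here is a minimal
    standalone C sketch that treats the physical ranges of extents 0 and 1
    above as half-open byte intervals and checks whether they intersect;
    the struct and function names are hypothetical, not kernel APIs.

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        struct extent {
                uint64_t pa;    /* physical offset in bytes */
                uint64_t len;   /* extent length in bytes */
        };

        /* do the half-open ranges [pa, pa + len) of @a and @b intersect? */
        static bool phys_overlap(const struct extent *a, const struct extent *b)
        {
                return a->pa < b->pa + b->len && b->pa < a->pa + a->len;
        }

        int main(void)
        {
                /* values taken from the extent dump above */
                struct extent e0 = { .pa = 151552, .len = 16384 };
                struct extent e1 = { .pa = 155648, .len = 16384 };

                printf("extents 0/1 physically overlap: %s\n",
                       phys_overlap(&e0, &e1) ? "yes" : "no");
                return 0;
        }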
    
    First, managed folios containing compressed data will be marked as
    up-to-date and then unlocked immediately (unlike in-place folios) when
    compressed I/Os are complete.  If physical blocks are not submitted in
    incremental order, separate BIOs should be used to avoid dependency
    issues.  However, the current code mis-arranges z_erofs_fill_bio_vec()
    and BIO submission, which causes unexpected BIO waits.
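
    As a rough standalone sketch of that ordering idea (not the kernel
    implementation; all names and values below are made up), the point is
    to drain the in-flight BIO *before* picking the next page whenever the
    next physical address is discontiguous, so a page never has to wait
    behind an unsubmitted BIO:

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                /* physical offsets (bytes) of the pclusters to read */
                uint64_t pa[] = { 151552, 155648, 537223168 };
                uint64_t last_end = 0;
                bool bio_open = false;
                unsigned int i;

                for (i = 0; i < sizeof(pa) / sizeof(pa[0]); i++) {
                        /*
                         * Drain the in-flight BIO *before* taking the
                         * next page if the physical address jumps.
                         */
                        if (bio_open && pa[i] != last_end) {
                                printf("submit BIO before %llu\n",
                                       (unsigned long long)pa[i]);
                                bio_open = false;
                        }
                        /* only now pick and fill the page for pa[i] */
                        printf("fill bvec for pa=%llu\n",
                               (unsigned long long)pa[i]);
                        bio_open = true;
                        last_end = pa[i] + 16384;
                }
                if (bio_open)
                        printf("submit final BIO\n");
                return 0;
        }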
    
    Second, managed folios will be connected to their own pclusters for
    efficient inter-queries.  However, this is hard to implement cleanly
    if overlapped big pclusters exist.  Again, these only appear in
    fuzzed images, so let's simply fall back to temporary short-lived
    pages for correctness.
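
    A minimal standalone sketch of that fallback decision (hypothetical
    names such as folio_stub and reuse_managed_folio, not kernel code):
    a managed folio is only reused if it is still owned by the pcluster
    being read; otherwise a temporary short-lived page is used instead.

        #include <stdbool.h>
        #include <stdio.h>

        struct pcluster { int id; };            /* stand-in only */

        struct folio_stub {
                const struct pcluster *owner;   /* plays folio->private */
        };

        /* reuse the managed folio only if it still belongs to @pcl */
        static bool reuse_managed_folio(const struct folio_stub *folio,
                                        const struct pcluster *pcl)
        {
                return folio->owner == pcl;
        }

        int main(void)
        {
                struct pcluster a = { 1 }, b = { 2 };
                struct folio_stub folio = { .owner = &a };

                printf("owning pcluster: reuse=%d\n",
                       reuse_managed_folio(&folio, &a));
                printf("other pcluster:  reuse=%d (use short-lived page)\n",
                       reuse_managed_folio(&folio, &b));
                return 0;
        }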
    
    Additionally, this patch asserts that referenced managed folios cannot
    be truncated for now, and it reverts part of commit 2080ca1ed3e4
    ("erofs: tidy up `struct z_erofs_bvec`") for simplicity, although that
    shouldn't make any difference.
    
    Reported-by: syzbot+4fc98ed414ae63d1ada2@xxxxxxxxxxxxxxxxxxxxxxxxx
    Reported-by: syzbot+de04e06b28cfecf2281c@xxxxxxxxxxxxxxxxxxxxxxxxx
    Reported-by: syzbot+c8c8238b394be4a1087d@xxxxxxxxxxxxxxxxxxxxxxxxx
    Tested-by: syzbot+4fc98ed414ae63d1ada2@xxxxxxxxxxxxxxxxxxxxxxxxx
    Closes: https://lore.kernel.org/r/0000000000002fda01061e334873@xxxxxxxxxx
    Fixes: 8e6c8fa9f2e9 ("erofs: enable big pcluster feature")
    Signed-off-by: Gao Xiang <hsiangkao@xxxxxxxxxxxxxxxxx>
    Link: https://lore.kernel.org/r/20240910070847.3356592-1-hsiangkao@xxxxxxxxxxxxxxxxx
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index d269b092c477f..2535479fb82db 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1416,6 +1416,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
 	struct z_erofs_bvec zbv;
 	struct address_space *mapping;
 	struct folio *folio;
+	struct page *page;
 	int bs = i_blocksize(f->inode);
 
 	/* Except for inplace folios, the entire folio can be used for I/Os */
@@ -1438,7 +1439,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
 	 * file-backed folios will be used instead.
 	 */
 	if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) {
-		folio->private = 0;
 		tocache = true;
 		goto out_tocache;
 	}
@@ -1456,7 +1456,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
 	}
 
 	folio_lock(folio);
-	if (folio->mapping == mc) {
+	if (likely(folio->mapping == mc)) {
 		/*
 		 * The cached folio is still in managed cache but without
 		 * a valid `->private` pcluster hint.  Let's reconnect them.
@@ -1466,41 +1466,44 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
 			/* compressed_bvecs[] already takes a ref before */
 			folio_put(folio);
 		}
-
-		/* no need to submit if it is already up-to-date */
-		if (folio_test_uptodate(folio)) {
-			folio_unlock(folio);
-			bvec->bv_page = NULL;
+		if (likely(folio->private == pcl))  {
+			/* don't submit cache I/Os again if already uptodate */
+			if (folio_test_uptodate(folio)) {
+				folio_unlock(folio);
+				bvec->bv_page = NULL;
+			}
+			return;
 		}
-		return;
+		/*
+		 * Already linked with another pcluster, which only appears in
+		 * crafted images by fuzzers for now.  But handle this anyway.
+		 */
+		tocache = false;	/* use temporary short-lived pages */
+	} else {
+		DBG_BUGON(1); /* referenced managed folios can't be truncated */
+		tocache = true;
 	}
-
-	/*
-	 * It has been truncated, so it's unsafe to reuse this one. Let's
-	 * allocate a new page for compressed data.
-	 */
-	DBG_BUGON(folio->mapping);
-	tocache = true;
 	folio_unlock(folio);
 	folio_put(folio);
 out_allocfolio:
-	zbv.page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
+	page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
 	spin_lock(&pcl->obj.lockref.lock);
-	if (pcl->compressed_bvecs[nr].page) {
-		erofs_pagepool_add(&f->pagepool, zbv.page);
+	if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
+		erofs_pagepool_add(&f->pagepool, page);
 		spin_unlock(&pcl->obj.lockref.lock);
 		cond_resched();
 		goto repeat;
 	}
-	bvec->bv_page = pcl->compressed_bvecs[nr].page = zbv.page;
-	folio = page_folio(zbv.page);
-	/* first mark it as a temporary shortlived folio (now 1 ref) */
-	folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
+	bvec->bv_page = pcl->compressed_bvecs[nr].page = page;
+	folio = page_folio(page);
 	spin_unlock(&pcl->obj.lockref.lock);
 out_tocache:
 	if (!tocache || bs != PAGE_SIZE ||
-	    filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp))
+	    filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) {
+		/* turn into a temporary shortlived folio (1 ref) */
+		folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
 		return;
+	}
 	folio_attach_private(folio, pcl);
 	/* drop a refcount added by allocpage (then 2 refs in total here) */
 	folio_put(folio);
@@ -1635,13 +1638,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
 		cur = mdev.m_pa;
 		end = cur + pcl->pclustersize;
 		do {
-			z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
-			if (!bvec.bv_page)
-				continue;
-
+			bvec.bv_page = NULL;
 			if (bio && (cur != last_pa ||
 				    bio->bi_bdev != mdev.m_bdev)) {
-io_retry:
+drain_io:
 				if (!erofs_is_fscache_mode(sb))
 					submit_bio(bio);
 				else
@@ -1654,6 +1654,15 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
 				bio = NULL;
 			}
 
+			if (!bvec.bv_page) {
+				z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
+				if (!bvec.bv_page)
+					continue;
+				if (cur + bvec.bv_len > end)
+					bvec.bv_len = end - cur;
+				DBG_BUGON(bvec.bv_len < sb->s_blocksize);
+			}
+
 			if (unlikely(PageWorkingset(bvec.bv_page)) &&
 			    !memstall) {
 				psi_memstall_enter(&pflags);
@@ -1673,13 +1682,9 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
 				++nr_bios;
 			}
 
-			if (cur + bvec.bv_len > end)
-				bvec.bv_len = end - cur;
-			DBG_BUGON(bvec.bv_len < sb->s_blocksize);
 			if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
 					  bvec.bv_offset))
-				goto io_retry;
-
+				goto drain_io;
 			last_pa = cur + bvec.bv_len;
 			bypass = false;
 		} while ((cur += bvec.bv_len) < end);



