[RFC 1/4] gfs2: Revert readahead conversion

Andreas Gruenbacher <agruenba@xxxxxxxxxx> · Thu, 2 Jul 2020 18:51:17 +0200

Commit d4388340ae0b ("fs: convert mpage_readpages to mpage_readahead")
converted gfs2 and other filesystems from the ->readpages to the
->readahead address space operation.  Other than ->readpages,
->readahead is passed the pages to readahead locked.  Due to problems in
the current page locking strategy, this is now causing deadlocks in
gfs2.

Fix this by reinstating mpage_readpages from before commit d4388340ae0b
and by converting gfs2 back to ->readpages.

Signed-off-by: Andreas Gruenbacher <agruenba@xxxxxxxxxx>
---
 fs/gfs2/aops.c        | 23 ++++++++-----
 fs/mpage.c            | 75 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/mpage.h |  2 ++
 3 files changed, 92 insertions(+), 8 deletions(-)

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 72c9560f4467..786c1ce8f030 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -577,7 +577,7 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
 }
 
 /**
- * gfs2_readahead - Read a bunch of pages at once
+ * gfs2_readpages - Read a bunch of pages at once
  * @file: The file to read from
  * @mapping: Address space info
  * @pages: List of pages to read
@@ -590,24 +590,31 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
  *    obviously not something we'd want to do on too regular a basis.
  *    Any I/O we ignore at this time will be done via readpage later.
  * 2. We don't handle stuffed files here we let readpage do the honours.
- * 3. mpage_readahead() does most of the heavy lifting in the common case.
+ * 3. mpage_readpages() does most of the heavy lifting in the common case.
  * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places.
  */
 
-static void gfs2_readahead(struct readahead_control *rac)
+static int gfs2_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *pages, unsigned nr_pages)
 {
-	struct inode *inode = rac->mapping->host;
+	struct inode *inode = mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_holder gh;
+	int ret;
 
 	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
-	if (gfs2_glock_nq(&gh))
+	ret = gfs2_glock_nq(&gh);
+	if (unlikely(ret))
 		goto out_uninit;
 	if (!gfs2_is_stuffed(ip))
-		mpage_readahead(rac, gfs2_block_map);
+		ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map);
 	gfs2_glock_dq(&gh);
 out_uninit:
 	gfs2_holder_uninit(&gh);
+	if (unlikely(gfs2_withdrawn(sdp)))
+		ret = -EIO;
+	return ret;
 }
 
 /**
@@ -826,7 +833,7 @@ static const struct address_space_operations gfs2_aops = {
 	.writepage = gfs2_writepage,
 	.writepages = gfs2_writepages,
 	.readpage = gfs2_readpage,
-	.readahead = gfs2_readahead,
+	.readpages = gfs2_readpages,
 	.bmap = gfs2_bmap,
 	.invalidatepage = gfs2_invalidatepage,
 	.releasepage = gfs2_releasepage,
@@ -840,7 +847,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
 	.writepage = gfs2_jdata_writepage,
 	.writepages = gfs2_jdata_writepages,
 	.readpage = gfs2_readpage,
-	.readahead = gfs2_readahead,
+	.readpages = gfs2_readpages,
 	.set_page_dirty = jdata_set_page_dirty,
 	.bmap = gfs2_bmap,
 	.invalidatepage = gfs2_invalidatepage,
diff --git a/fs/mpage.c b/fs/mpage.c
index 830e6cc2a9e7..5243a065a062 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -396,6 +396,81 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
 }
 EXPORT_SYMBOL(mpage_readahead);
 
+/**
+ * mpage_readpages - populate an address space with some pages & start reads against them
+ * @mapping: the address_space
+ * @pages: The address of a list_head which contains the target pages.  These
+ *   pages have their ->index populated and are otherwise uninitialised.
+ *   The page at @pages->prev has the lowest file offset, and reads should be
+ *   issued in @pages->prev to @pages->next order.
+ * @nr_pages: The number of pages at *@pages
+ * @get_block: The filesystem's block mapper function.
+ *
+ * This function walks the pages and the blocks within each page, building and
+ * emitting large BIOs.
+ *
+ * If anything unusual happens, such as:
+ *
+ * - encountering a page which has buffers
+ * - encountering a page which has a non-hole after a hole
+ * - encountering a page with non-contiguous blocks
+ *
+ * then this code just gives up and calls the buffer_head-based read function.
+ * It does handle a page which has holes at the end - that is a common case:
+ * the end-of-file on blocksize < PAGE_SIZE setups.
+ *
+ * BH_Boundary explanation:
+ *
+ * There is a problem.  The mpage read code assembles several pages, gets all
+ * their disk mappings, and then submits them all.  That's fine, but obtaining
+ * the disk mappings may require I/O.  Reads of indirect blocks, for example.
+ *
+ * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
+ * submitted in the following order:
+ *
+ * 	12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
+ *
+ * because the indirect block has to be read to get the mappings of blocks
+ * 13,14,15,16.  Obviously, this impacts performance.
+ *
+ * So what we do it to allow the filesystem's get_block() function to set
+ * BH_Boundary when it maps block 11.  BH_Boundary says: mapping of the block
+ * after this one will require I/O against a block which is probably close to
+ * this one.  So you should push what I/O you have currently accumulated.
+ *
+ * This all causes the disk requests to be issued in the correct order.
+ */
+int
+mpage_readpages(struct address_space *mapping, struct list_head *pages,
+				unsigned nr_pages, get_block_t get_block)
+{
+	struct mpage_readpage_args args = {
+		.get_block = get_block,
+		.is_readahead = true,
+	};
+	unsigned page_idx;
+
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+		if (!add_to_page_cache_lru(page, mapping,
+					page->index,
+					readahead_gfp_mask(mapping))) {
+			args.page = page;
+			args.nr_pages = nr_pages - page_idx;
+			args.bio = do_mpage_readpage(&args);
+		}
+		put_page(page);
+	}
+	BUG_ON(!list_empty(pages));
+	if (args.bio)
+		mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio);
+	return 0;
+}
+EXPORT_SYMBOL(mpage_readpages);
+
 /*
  * This isn't called much at all
  */
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index f4f5e90a6844..181f1b0fbd83 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -16,6 +16,8 @@ struct writeback_control;
 struct readahead_control;
 
 void mpage_readahead(struct readahead_control *, get_block_t get_block);
+int mpage_readpages(struct address_space *, struct list_head *, unsigned,
+		    get_block_t);
 int mpage_readpage(struct page *page, get_block_t get_block);
 int mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block);
-- 
2.26.2