Re: [RFC 2/8] shmem: convert to use folio_test_hwpoison()

Luis Chamberlain <mcgrof@xxxxxxxxxx> · Wed, 26 Apr 2023 00:43:10 -0700

On Tue, Apr 25, 2023 at 03:47:24PM -0700, Luis Chamberlain wrote:
> On Tue, Apr 25, 2023 at 01:00:25PM +0200, Pankaj Raghav wrote:
> > On Fri, Apr 21, 2023 at 11:42:53PM +0100, Matthew Wilcox wrote:
> > > On Fri, Apr 21, 2023 at 02:43:54PM -0700, Luis Chamberlain wrote:
> > > > The PageHWPoison() call can be converted over to the respective folio call
> > > > folio_test_hwpoison(). This introduces no functional changes.
> > > 
> > > Um, no.  Nobody should use folio_test_hwpoison(), it's a nonsense.
> > > 
> > > Individual pages are hwpoisoned.  You're only testing the head page
> > > if you use folio_test_hwpoison().  There's folio_has_hwpoisoned() to
> > > test if _any_ page in the folio is poisoned.  But blindly converting
> > > PageHWPoison to folio_test_hwpoison() is wrong.
> > 
> > I see a pattern in shmem.c where first the head is tested and for large
> > folios, any of pages in the folio is tested for poison flag. Should we
> > factor it out as a helper in shmem.c and use it here?
> > 
> > static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
> > ...
> > 	if (folio_test_hwpoison(folio) ||
> > 	    (folio_test_large(folio) &&
> > 	     folio_test_has_hwpoisoned(folio))) {
> > 	..
> 
> Hugh's commit 72887c976a7c9e ("shmem: minor fixes to splice-read
> implementation") is on point about this :
> 
>   "Perhaps that ugliness can be improved at the mm end later"
> 
> So how about we put some lipstick on this guy now (notice right above it
> a similar compound page check for is_page_hwpoison()):
> 
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index 1c68d67b832f..6a4a571dbe50 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -883,6 +883,13 @@ static inline bool is_page_hwpoison(struct page *page)
>  	return PageHuge(page) && PageHWPoison(compound_head(page));
>  }
>  
> +static inline bool is_folio_hwpoison(struct folio *folio)
> +{
> +	if (folio_test_hwpoison(folio))
> +		return true;
> +	return folio_test_large(folio) && folio_test_has_hwpoisoned(folio);
> +}
> +
>  /*
>   * For pages that are never mapped to userspace (and aren't PageSlab),
>   * page_type may be used.  Because it is initialised to -1, we invert the
> diff --git a/mm/shmem.c b/mm/shmem.c
> index ef7ad684f4fb..b7f47f6b75d5 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -3013,9 +3013,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
>  		if (folio) {
>  			folio_unlock(folio);
>  
> -			if (folio_test_hwpoison(folio) ||
> -			    (folio_test_large(folio) &&
> -			     folio_test_has_hwpoisoned(folio))) {
> +			if (is_folio_hwpoison(folio)) {
>  				error = -EIO;
>  				break;
>  			}

With this, this I end up with the following for shmem_file_read_iter().
For some odd reason without the first hunk I see that some non SB_KERNMOUNT
end up with a silly inode->i_blkbits.

I must be doing something wrong with the shmem_file_read_iter() conversion
as I end up with a empty new line at the end, but I can't seem to
understand why. Any ideas?

diff --git a/mm/shmem.c b/mm/shmem.c
index 21a4b8522ac5..39ae17774dc3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2541,6 +2541,10 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
 		inode->i_ino = ino;
 		inode_init_owner(idmap, inode, dir, mode);
 		inode->i_blocks = 0;
+		if (sb->s_flags & SB_KERNMOUNT)
+			inode->i_blkbits = PAGE_SHIFT;
+		else
+			inode->i_blkbits = sb->s_blocksize_bits;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 		inode->i_generation = get_random_u32();
 		info = SHMEM_I(inode);
@@ -2786,18 +2790,23 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	struct address_space *mapping = inode->i_mapping;
+	struct super_block *sb = inode->i_sb;
+	u64 bsize = i_blocksize(inode);
 	pgoff_t index;
 	unsigned long offset;
 	int error = 0;
 	ssize_t retval = 0;
 	loff_t *ppos = &iocb->ki_pos;
 
+	/*
+	 * Although our index is page specific, we can read a blocksize at a
+	 * time as we use a folio per block.
+	 */
 	index = *ppos >> PAGE_SHIFT;
-	offset = *ppos & ~PAGE_MASK;
+	offset = *ppos & (bsize - 1);
 
 	for (;;) {
 		struct folio *folio = NULL;
-		struct page *page = NULL;
 		pgoff_t end_index;
 		unsigned long nr, ret;
 		loff_t i_size = i_size_read(inode);
@@ -2806,7 +2815,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		if (index > end_index)
 			break;
 		if (index == end_index) {
-			nr = i_size & ~PAGE_MASK;
+			nr = i_size & (bsize - 1);
 			if (nr <= offset)
 				break;
 		}
@@ -2819,9 +2828,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		}
 		if (folio) {
 			folio_unlock(folio);
-
-			page = folio_file_page(folio, index);
-			if (PageHWPoison(page)) {
+			if (is_folio_hwpoison(folio)) {
 				folio_put(folio);
 				error = -EIO;
 				break;
@@ -2831,50 +2838,63 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		/*
 		 * We must evaluate after, since reads (unlike writes)
 		 * are called without i_rwsem protection against truncate
+		 *
+		 * nr represents the number of bytes we can read per folio,
+		 * and this will depend on the blocksize set.
 		 */
-		nr = PAGE_SIZE;
+		nr = bsize;
+		WARN_ON(!(sb->s_flags & SB_KERNMOUNT) && folio && bsize != folio_size(folio));
 		i_size = i_size_read(inode);
 		end_index = i_size >> PAGE_SHIFT;
 		if (index == end_index) {
-			nr = i_size & ~PAGE_MASK;
+			nr = i_size & (bsize - 1);
 			if (nr <= offset) {
 				if (folio)
 					folio_put(folio);
 				break;
 			}
 		}
+		/*
+		 * On the first folio read this will amount to blocksize - offset. On subsequent
+		 * reads we can read blocksize at time until iov_iter_count(to) == 0.
+		 *
+		 * The offset represents the base we'll use to do the reads per folio, it
+		 * gets incremented by the number of bytes we read per folio and is aligned
+		 * to the blocksize. After a first offset block the offset would be 0 and
+		 * we'd read a block at a time.
+		 */
 		nr -= offset;
 
 		if (folio) {
 			/*
-			 * If users can be writing to this page using arbitrary
+			 * If users can be writing to this folio using arbitrary
 			 * virtual addresses, take care about potential aliasing
-			 * before reading the page on the kernel side.
+			 * before reading the folio on the kernel side.
 			 */
 			if (mapping_writably_mapped(mapping))
-				flush_dcache_page(page);
+				flush_dcache_folio(folio);
 			/*
-			 * Mark the page accessed if we read the beginning.
+			 * Mark the folio accessed if we read the beginning.
 			 */
 			if (!offset)
 				folio_mark_accessed(folio);
 			/*
-			 * Ok, we have the page, and it's up-to-date, so
+			 * Ok, we have the folio, and it's up-to-date, so
 			 * now we can copy it to user space...
 			 */
-			ret = copy_page_to_iter(page, offset, nr, to);
+			ret = copy_folio_to_iter(folio, offset, nr, to);
 			folio_put(folio);
 
 		} else if (user_backed_iter(to)) {
 			/*
 			 * Copy to user tends to be so well optimized, but
 			 * clear_user() not so much, that it is noticeably
-			 * faster to copy the zero page instead of clearing.
+			 * faster to copy the zero folio instead of clearing.
 			 */
-			ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
+			ret = copy_folio_to_iter(page_folio(ZERO_PAGE(0)), offset, nr, to);
 		} else {
 			/*
-			 * But submitting the same page twice in a row to
+			 * But submitting the same folio twice in a row to
 			 * splice() - or others? - can result in confusion:
 			 * so don't attempt that optimization on pipes etc.
 			 */
@@ -2883,8 +2903,13 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
 		retval += ret;
 		offset += ret;
+
+		/*
+		 * Due to usage of folios per blocksize we know this will actually read
+		 * blocksize at a time after the first block read at offset.
+		 */
 		index += offset >> PAGE_SHIFT;
-		offset &= ~PAGE_MASK;
+		offset &= (bsize - 1);
 
 		if (!iov_iter_count(to))
 			break;
-- 
2.39.2