Re: [LSF/MM/BPF TOPIC] Measuring limits and enhancing buffered IO

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



I'm coming back to this ancient discussion, because I've actually been
running that odd small-reads patch on my machine since February,
because it's been in my random pile of "local test patches".

I had to rebase the patch because it conflicted with a recent fix, and
as part of rebasing it I looked at it again.

And I did find that the "do small reads entirely under RCU" didn't
test for one condition that filemap_get_read_batch() checked for: the
xa_is_sibling() check.

So...

On Tue, 14 May 2024 at 04:52, Luis Chamberlain <mcgrof@xxxxxxxxxx> wrote:
>
> On Mon, Feb 26, 2024 at 02:46:56PM -0800, Linus Torvalds wrote:
> > I really haven't tested this AT ALL. I'm much too scared. But I don't
> > actually hate how the code looks nearly as much as I *thought* I'd
> > hate it.
>
> Thanks for this, obviously those interested in this will have to test
> this and fix the below issues. I've tested for regressions just against
> xfs on 4k reflink profile and detected only two failures, generic/095
> fails with a failure rate of about 1/2 or so:
>
>   * generic/095
>   * generic/741

Would you mind seeing if that ancient patch with the xa_is_sibling()
check added, and rebased to be on top of current -git maybe works for
you now?

I still haven't "really" tested this in any real way, except for
running all my normal desktop loads on it. Which isn't really saying
much. I compile kernels, and read email. That's pretty much all I do.
So no fstests or anything like that.

But it *has* worked in that capacity for 8+ months now. So it's not
entirely broken, but the fact that it failed fstests for you clearly
means that *something* was subtly but horribly broken in the original
version.

               Linus
From 2dafd15ceb00a41b9f6b37c1d84662039ba2d140 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Date: Mon, 26 Feb 2024 15:18:44 -0800
Subject: [PATCH] mm/filemap: do small reads in RCU-mode read without refcounts

Hackety hack hack concept from report by Willy.

Mommy, I'm scared.

Link: https://lore.kernel.org/all/Zduto30LUEqIHg4h@xxxxxxxxxxxxxxxxxxxx/
Not-yet-signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
---
 mm/filemap.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 107 insertions(+), 10 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 56fa431c52af..91dd09c43af5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2594,6 +2594,85 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
 	return (pos1 >> shift == pos2 >> shift);
 }
 
+/*
+ * I can't be bothered to care about HIGHMEM for the fast read case
+ */
+#ifdef CONFIG_HIGHMEM
+#define filemap_fast_read(mapping, pos, buffer, size) 0
+#else
+
+/*
+ * Called under RCU with size limited to the file size and one page
+ */
+static inline unsigned long filemap_folio_copy_rcu(struct address_space *mapping, loff_t pos, char *buffer, size_t size)
+{
+	XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT);
+	struct folio *folio;
+	size_t offset;
+
+	xas_reset(&xas);
+	folio = xas_load(&xas);
+	if (xas_retry(&xas, folio))
+		return 0;
+
+	if (!folio || xa_is_value(folio) || xa_is_sibling(folio))
+		return 0;
+
+	if (!folio_test_uptodate(folio))
+		return 0;
+
+	/* No fast-case if we are supposed to start readahead */
+	if (folio_test_readahead(folio))
+		return 0;
+	/* .. or mark it accessed */
+	if (!folio_test_referenced(folio))
+		return 0;
+
+	/* Do the data copy */
+	offset = pos & (folio_size(folio) - 1);
+	memcpy(buffer, folio_address(folio) + offset, size);
+
+	/* We should probably do some silly memory barrier here */
+	if (unlikely(folio != xas_reload(&xas)))
+		return 0;
+
+	return size;
+}
+
+/*
+ * Iff we can complete the read completely in one atomic go under RCU,
+ * do so here. Otherwise return 0 (no partial reads, please - this is
+ * purely for the trivial fast case).
+ */
+static inline unsigned long filemap_fast_read(struct address_space *mapping, loff_t pos, char *buffer, size_t size)
+{
+	struct inode *inode;
+	loff_t file_size;
+	unsigned long pgoff;
+
+	/* Don't even try for page-crossers */
+	pgoff = pos & ~PAGE_MASK;
+	if (pgoff + size > PAGE_SIZE)
+		return 0;
+
+	/* Limit it to the file size */
+	inode = mapping->host;
+	file_size = i_size_read(inode);
+	if (unlikely(pos >= file_size))
+		return 0;
+	file_size -= pos;
+	if (file_size < size)
+		size = file_size;
+
+	/* Let's see if we can just do the read under RCU */
+	rcu_read_lock();
+	size = filemap_folio_copy_rcu(mapping, pos, buffer, size);
+	rcu_read_unlock();
+
+	return size;
+}
+#endif /* !HIGHMEM */
+
 /**
  * filemap_read - Read data from the page cache.
  * @iocb: The iocb to read.
@@ -2614,7 +2693,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 	struct file_ra_state *ra = &filp->f_ra;
 	struct address_space *mapping = filp->f_mapping;
 	struct inode *inode = mapping->host;
-	struct folio_batch fbatch;
+	union {
+		struct folio_batch fbatch;
+		__DECLARE_FLEX_ARRAY(char, buffer);
+	} area;
 	int i, error = 0;
 	bool writably_mapped;
 	loff_t isize, end_offset;
@@ -2626,7 +2708,22 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 		return 0;
 
 	iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
-	folio_batch_init(&fbatch);
+
+	if (iov_iter_count(iter) <= sizeof(area)) {
+		unsigned long count = iov_iter_count(iter);
+
+		count = filemap_fast_read(mapping, iocb->ki_pos, area.buffer, count);
+		if (count) {
+			size_t copied = copy_to_iter(area.buffer, count, iter);
+			if (unlikely(!copied))
+				return already_read ? already_read : -EFAULT;
+			ra->prev_pos = iocb->ki_pos += copied;
+			file_accessed(filp);
+			return copied + already_read;
+		}
+	}
+
+	folio_batch_init(&area.fbatch);
 
 	do {
 		cond_resched();
@@ -2642,7 +2739,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 		if (unlikely(iocb->ki_pos >= i_size_read(inode)))
 			break;
 
-		error = filemap_get_pages(iocb, iter->count, &fbatch, false);
+		error = filemap_get_pages(iocb, iter->count, &area.fbatch, false);
 		if (error < 0)
 			break;
 
@@ -2670,11 +2767,11 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 		 * mark it as accessed the first time.
 		 */
 		if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
-				    fbatch.folios[0]))
-			folio_mark_accessed(fbatch.folios[0]);
+				    area.fbatch.folios[0]))
+			folio_mark_accessed(area.fbatch.folios[0]);
 
-		for (i = 0; i < folio_batch_count(&fbatch); i++) {
-			struct folio *folio = fbatch.folios[i];
+		for (i = 0; i < folio_batch_count(&area.fbatch); i++) {
+			struct folio *folio = area.fbatch.folios[i];
 			size_t fsize = folio_size(folio);
 			size_t offset = iocb->ki_pos & (fsize - 1);
 			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
@@ -2705,9 +2802,9 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 			}
 		}
 put_folios:
-		for (i = 0; i < folio_batch_count(&fbatch); i++)
-			folio_put(fbatch.folios[i]);
-		folio_batch_init(&fbatch);
+		for (i = 0; i < folio_batch_count(&area.fbatch); i++)
+			folio_put(area.fbatch.folios[i]);
+		folio_batch_init(&area.fbatch);
 	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
 
 	file_accessed(filp);

[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux