I'm coming back to this ancient discussion because I've actually been running that odd small-reads patch on my machine since February — it's been sitting in my random pile of "local test patches". I had to rebase the patch because it conflicted with a recent fix, and as part of rebasing it I looked at it again. And I did find that the "do small reads entirely under RCU" didn't test for one condition that filemap_get_read_batch() checked for: the xa_is_sibling() check. So... On Tue, 14 May 2024 at 04:52, Luis Chamberlain <mcgrof@xxxxxxxxxx> wrote: > > On Mon, Feb 26, 2024 at 02:46:56PM -0800, Linus Torvalds wrote: > > I really haven't tested this AT ALL. I'm much too scared. But I don't > > actually hate how the code looks nearly as much as I *thought* I'd > > hate it. > > Thanks for this, obviously those interested in this will have to test > this and fix the below issues. I've tested for regressions just against > xfs on 4k reflink profile and detected only two failures, generic/095 > fails with a failure rate of about 1/2 or so: > > * generic/095 > * generic/741 Would you mind checking whether that ancient patch — with the xa_is_sibling() check added, and rebased on top of current -git — now works for you? I still haven't "really" tested this in any real way, except for running all my normal desktop loads on it. Which isn't really saying much. I compile kernels, and read email. That's pretty much all I do. So no fstests or anything like that. But it *has* worked in that capacity for 8+ months now. So it's not entirely broken, but the fact that it failed fstests for you clearly means that *something* was subtly but horribly broken in the original version. Linus
From 2dafd15ceb00a41b9f6b37c1d84662039ba2d140 Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Date: Mon, 26 Feb 2024 15:18:44 -0800 Subject: [PATCH] mm/filemap: do small reads in RCU-mode read without refcounts Hackety hack hack concept from report by Willy. Mommy, I'm scared. Link: https://lore.kernel.org/all/Zduto30LUEqIHg4h@xxxxxxxxxxxxxxxxxxxx/ Not-yet-signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> --- mm/filemap.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 107 insertions(+), 10 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 56fa431c52af..91dd09c43af5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2594,6 +2594,85 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) return (pos1 >> shift == pos2 >> shift); } +/* + * I can't be bothered to care about HIGHMEM for the fast read case + */ +#ifdef CONFIG_HIGHMEM +#define filemap_fast_read(mapping, pos, buffer, size) 0 +#else + +/* + * Called under RCU with size limited to the file size and one page + */ +static inline unsigned long filemap_folio_copy_rcu(struct address_space *mapping, loff_t pos, char *buffer, size_t size) +{ + XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT); + struct folio *folio; + size_t offset; + + xas_reset(&xas); + folio = xas_load(&xas); + if (xas_retry(&xas, folio)) + return 0; + + if (!folio || xa_is_value(folio) || xa_is_sibling(folio)) + return 0; + + if (!folio_test_uptodate(folio)) + return 0; + + /* No fast-case if we are supposed to start readahead */ + if (folio_test_readahead(folio)) + return 0; + /* .. 
or mark it accessed */ + if (!folio_test_referenced(folio)) + return 0; + + /* Do the data copy */ + offset = pos & (folio_size(folio) - 1); + memcpy(buffer, folio_address(folio) + offset, size); + + /* We should probably do some silly memory barrier here */ + if (unlikely(folio != xas_reload(&xas))) + return 0; + + return size; +} + +/* + * Iff we can complete the read completely in one atomic go under RCU, + * do so here. Otherwise return 0 (no partial reads, please - this is + * purely for the trivial fast case). + */ +static inline unsigned long filemap_fast_read(struct address_space *mapping, loff_t pos, char *buffer, size_t size) +{ + struct inode *inode; + loff_t file_size; + unsigned long pgoff; + + /* Don't even try for page-crossers */ + pgoff = pos & ~PAGE_MASK; + if (pgoff + size > PAGE_SIZE) + return 0; + + /* Limit it to the file size */ + inode = mapping->host; + file_size = i_size_read(inode); + if (unlikely(pos >= file_size)) + return 0; + file_size -= pos; + if (file_size < size) + size = file_size; + + /* Let's see if we can just do the read under RCU */ + rcu_read_lock(); + size = filemap_folio_copy_rcu(mapping, pos, buffer, size); + rcu_read_unlock(); + + return size; +} +#endif /* !HIGHMEM */ + /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. 
@@ -2614,7 +2693,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct folio_batch fbatch; + union { + struct folio_batch fbatch; + __DECLARE_FLEX_ARRAY(char, buffer); + } area; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; @@ -2626,7 +2708,22 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); - folio_batch_init(&fbatch); + + if (iov_iter_count(iter) <= sizeof(area)) { + unsigned long count = iov_iter_count(iter); + + count = filemap_fast_read(mapping, iocb->ki_pos, area.buffer, count); + if (count) { + size_t copied = copy_to_iter(area.buffer, count, iter); + if (unlikely(!copied)) + return already_read ? already_read : -EFAULT; + ra->prev_pos = iocb->ki_pos += copied; + file_accessed(filp); + return copied + already_read; + } + } + + folio_batch_init(&area.fbatch); do { cond_resched(); @@ -2642,7 +2739,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter->count, &fbatch, false); + error = filemap_get_pages(iocb, iter->count, &area.fbatch, false); if (error < 0) break; @@ -2670,11 +2767,11 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, * mark it as accessed the first time. 
*/ if (!pos_same_folio(iocb->ki_pos, last_pos - 1, - fbatch.folios[0])) - folio_mark_accessed(fbatch.folios[0]); + area.fbatch.folios[0])) + folio_mark_accessed(area.fbatch.folios[0]); - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; + for (i = 0; i < folio_batch_count(&area.fbatch); i++) { + struct folio *folio = area.fbatch.folios[i]; size_t fsize = folio_size(folio); size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, @@ -2705,9 +2802,9 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } } put_folios: - for (i = 0; i < folio_batch_count(&fbatch); i++) - folio_put(fbatch.folios[i]); - folio_batch_init(&fbatch); + for (i = 0; i < folio_batch_count(&area.fbatch); i++) + folio_put(area.fbatch.folios[i]); + folio_batch_init(&area.fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp);