Should the BUG() be changed to a WARN() or warn once? ./scripts/checkpatch.pl fs/smb/client/0001-cifs-Fix-writeback-data-corruption.patch WARNING: Do not crash the kernel unless it is absolutely unavoidable--use WARN_ON_ONCE() plus recovery code (if feasible) instead of BUG() or variants #205: FILE: fs/smb/client/file.c:2758: + BUG(); total: 0 errors, 1 warnings, 439 lines checked On Thu, Feb 22, 2024 at 5:20 AM David Howells <dhowells@xxxxxxxxxx> wrote: > > cifs writeback doesn't correctly handle the case where > cifs_extend_writeback() hits a point where it is considering an additional > folio, but this would overrun the wsize - at which point it drops out of > the xarray scanning loop and calls xas_pause(). The problem is that > xas_pause() advances the loop counter - thereby skipping that page. > > What needs to happen is for xas_reset() to be called any time we decide we > don't want to process the page we're looking at, but rather send the > request we are building and start a new one. > > Fix this by copying and adapting the netfslib writepages code as a > temporary measure, with cifs writeback intending to be offloaded to > netfslib in the near future. > > This also fixes the issue with the use of filemap_get_folios_tag() causing > retry of a bunch of pages which the extender already dealt with. > > This can be tested by creating, say, a 64K file somewhere not on cifs > (otherwise copy-offload may get underfoot), mounting a cifs share with a > wsize of 64000, copying the file to it and then comparing the original file > and the copy: > > dd if=/dev/urandom of=/tmp/64K bs=64k count=1 > mount //192.168.6.1/test /mnt -o user=...,pass=...,wsize=64000 > cp /tmp/64K /mnt/64K > cmp /tmp/64K /mnt/64K > > Without the fix, the cmp fails at position 64000 (or shortly thereafter). > > Fixes: d08089f649a0 ("cifs: Change the I/O paths to use an iterator rather than a page list") > Signed-off-by: David Howells <dhowells@xxxxxxxxxx> > cc: Steve French <sfrench@xxxxxxxxx> > cc: Paulo Alcantara <pc@xxxxxxxxxxxxx> > cc: Ronnie Sahlberg <ronniesahlberg@xxxxxxxxx> > cc: Shyam Prasad N <sprasad@xxxxxxxxxxxxx> > cc: Tom Talpey <tom@xxxxxxxxxx> > cc: Jeff Layton <jlayton@xxxxxxxxxx> > cc: linux-cifs@xxxxxxxxxxxxxxx > cc: samba-technical@xxxxxxxxxxxxxxx > cc: netfs@xxxxxxxxxxxxxxx > cc: linux-fsdevel@xxxxxxxxxxxxxxx > --- > fs/smb/client/file.c | 284 ++++++++++++++++++++++++++++----------------------- > 1 file changed, 158 insertions(+), 126 deletions(-) > > diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c > index f391c9b803d8..671ce6ebd203 100644 > --- a/fs/smb/client/file.c > +++ b/fs/smb/client/file.c > @@ -2624,20 +2624,20 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) > * dirty pages if possible, but don't sleep while doing so. > */ > static void cifs_extend_writeback(struct address_space *mapping, > + struct xa_state *xas, > long *_count, > loff_t start, > int max_pages, > - size_t max_len, > - unsigned int *_len) > + loff_t max_len, > + size_t *_len) > { > struct folio_batch batch; > struct folio *folio; > - unsigned int psize, nr_pages; > - size_t len = *_len; > - pgoff_t index = (start + len) / PAGE_SIZE; > + unsigned int nr_pages; > + pgoff_t index = (start + *_len) / PAGE_SIZE; > + size_t len; > bool stop = true; > unsigned int i; > - XA_STATE(xas, &mapping->i_pages, index); > > folio_batch_init(&batch); > > @@ -2648,54 +2648,64 @@ static void cifs_extend_writeback(struct address_space *mapping, > */ > rcu_read_lock(); > > - xas_for_each(&xas, folio, ULONG_MAX) { > + xas_for_each(xas, folio, ULONG_MAX) { > stop = true; > - if (xas_retry(&xas, folio)) > + if (xas_retry(xas, folio)) > continue; > if (xa_is_value(folio)) > break; > - if (folio->index != index) > + if (folio->index != index) { > + xas_reset(xas); > break; > + } > + > if (!folio_try_get_rcu(folio)) { > - xas_reset(&xas); > + xas_reset(xas); > continue; > } > nr_pages = folio_nr_pages(folio); > - if (nr_pages > max_pages) > + if (nr_pages > max_pages) { > + xas_reset(xas); > break; > + } > > /* Has the page moved or been split? */ > - if (unlikely(folio != xas_reload(&xas))) { > + if (unlikely(folio != xas_reload(xas))) { > folio_put(folio); > + xas_reset(xas); > break; > } > > if (!folio_trylock(folio)) { > folio_put(folio); > + xas_reset(xas); > break; > } > - if (!folio_test_dirty(folio) || folio_test_writeback(folio)) { > + if (!folio_test_dirty(folio) || > + folio_test_writeback(folio)) { > folio_unlock(folio); > folio_put(folio); > + xas_reset(xas); > break; > } > > max_pages -= nr_pages; > - psize = folio_size(folio); > - len += psize; > + len = folio_size(folio); > stop = false; > - if (max_pages <= 0 || len >= max_len || *_count <= 0) > - stop = true; > > index += nr_pages; > + *_count -= nr_pages; > + *_len += len; > + if (max_pages <= 0 || *_len >= max_len || *_count <= 0) > + stop = true; > + > if (!folio_batch_add(&batch, folio)) > break; > if (stop) > break; > } > > - if (!stop) > - xas_pause(&xas); > + xas_pause(xas); > rcu_read_unlock(); > > /* Now, if we obtained any pages, we can shift them to being > @@ -2712,16 +2722,12 @@ static void cifs_extend_writeback(struct address_space *mapping, > if (!folio_clear_dirty_for_io(folio)) > WARN_ON(1); > folio_start_writeback(folio); > - > - *_count -= folio_nr_pages(folio); > folio_unlock(folio); > } > > folio_batch_release(&batch); > cond_resched(); > } while (!stop); > - > - *_len = len; > } > > /* > @@ -2729,8 +2735,10 @@ static void cifs_extend_writeback(struct address_space *mapping, > */ > static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping, > struct writeback_control *wbc, > + struct xa_state *xas, > struct folio *folio, > - loff_t start, loff_t end) > + unsigned long long start, > + unsigned long long end) > { > struct inode *inode = mapping->host; > struct TCP_Server_Info *server; > @@ -2739,17 +2747,18 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping, > struct cifs_credits credits_on_stack; > struct cifs_credits *credits = &credits_on_stack; > struct cifsFileInfo *cfile = NULL; > - unsigned int xid, wsize, len; > - loff_t i_size = i_size_read(inode); > - size_t max_len; > + unsigned long long i_size = i_size_read(inode), max_len; > + unsigned int xid, wsize; > + size_t len; > long count = wbc->nr_to_write; > int rc; > > /* The folio should be locked, dirty and not undergoing writeback. */ > + if (!folio_clear_dirty_for_io(folio)) > + BUG(); > folio_start_writeback(folio); > > count -= folio_nr_pages(folio); > - len = folio_size(folio); > > xid = get_xid(); > server = cifs_pick_channel(cifs_sb_master_tcon(cifs_sb)->ses); > @@ -2779,10 +2788,12 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping, > wdata->server = server; > cfile = NULL; > > - /* Find all consecutive lockable dirty pages, stopping when we find a > - * page that is not immediately lockable, is not dirty or is missing, > - * or we reach the end of the range. > + /* Find all consecutive lockable dirty pages that have contiguous > + * written regions, stopping when we find a page that is not > + * immediately lockable, is not dirty or is missing, or we reach the > + * end of the range. > */ > + len = folio_size(folio); > if (start < i_size) { > /* Trim the write to the EOF; the extra data is ignored. Also > * put an upper limit on the size of a single storedata op. > @@ -2801,19 +2812,18 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping, > max_pages -= folio_nr_pages(folio); > > if (max_pages > 0) > - cifs_extend_writeback(mapping, &count, start, > + cifs_extend_writeback(mapping, xas, &count, start, > max_pages, max_len, &len); > } > - len = min_t(loff_t, len, max_len); > } > - > - wdata->bytes = len; > + len = min_t(unsigned long long, len, i_size - start); > > /* We now have a contiguous set of dirty pages, each with writeback > * set; the first page is still locked at this point, but all the rest > * have been unlocked. > */ > folio_unlock(folio); > + wdata->bytes = len; > > if (start < i_size) { > iov_iter_xarray(&wdata->iter, ITER_SOURCE, &mapping->i_pages, > @@ -2864,102 +2874,118 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping, > /* > * write a region of pages back to the server > */ > -static int cifs_writepages_region(struct address_space *mapping, > - struct writeback_control *wbc, > - loff_t start, loff_t end, loff_t *_next) > +static ssize_t cifs_writepages_begin(struct address_space *mapping, > + struct writeback_control *wbc, > + struct xa_state *xas, > + unsigned long long *_start, > + unsigned long long end) > { > - struct folio_batch fbatch; > + struct folio *folio; > + unsigned long long start = *_start; > + ssize_t ret; > int skips = 0; > > - folio_batch_init(&fbatch); > - do { > - int nr; > - pgoff_t index = start / PAGE_SIZE; > +search_again: > + /* Find the first dirty page. */ > + rcu_read_lock(); > > - nr = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE, > - PAGECACHE_TAG_DIRTY, &fbatch); > - if (!nr) > + for (;;) { > + folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY); > + if (xas_retry(xas, folio) || xa_is_value(folio)) > + continue; > + if (!folio) > break; > > - for (int i = 0; i < nr; i++) { > - ssize_t ret; > - struct folio *folio = fbatch.folios[i]; > + if (!folio_try_get_rcu(folio)) { > + xas_reset(xas); > + continue; > + } > > -redo_folio: > - start = folio_pos(folio); /* May regress with THPs */ > + if (unlikely(folio != xas_reload(xas))) { > + folio_put(folio); > + xas_reset(xas); > + continue; > + } > > - /* At this point we hold neither the i_pages lock nor the > - * page lock: the page may be truncated or invalidated > - * (changing page->mapping to NULL), or even swizzled > - * back from swapper_space to tmpfs file mapping > - */ > - if (wbc->sync_mode != WB_SYNC_NONE) { > - ret = folio_lock_killable(folio); > - if (ret < 0) > - goto write_error; > - } else { > - if (!folio_trylock(folio)) > - goto skip_write; > - } > + xas_pause(xas); > + break; > + } > + rcu_read_unlock(); > + if (!folio) > + return 0; > > - if (folio->mapping != mapping || > - !folio_test_dirty(folio)) { > - start += folio_size(folio); > - folio_unlock(folio); > - continue; > - } > + start = folio_pos(folio); /* May regress with THPs */ > > - if (folio_test_writeback(folio) || > - folio_test_fscache(folio)) { > - folio_unlock(folio); > - if (wbc->sync_mode == WB_SYNC_NONE) > - goto skip_write; > + /* At this point we hold neither the i_pages lock nor the page lock: > + * the page may be truncated or invalidated (changing page->mapping to > + * NULL), or even swizzled back from swapper_space to tmpfs file > + * mapping > + */ > +lock_again: > + if (wbc->sync_mode != WB_SYNC_NONE) { > + ret = folio_lock_killable(folio); > + if (ret < 0) > + return ret; > + } else { > + if (!folio_trylock(folio)) > + goto search_again; > + } > > - folio_wait_writeback(folio); > + if (folio->mapping != mapping || > + !folio_test_dirty(folio)) { > + start += folio_size(folio); > + folio_unlock(folio); > + goto search_again; > + } > + > + if (folio_test_writeback(folio) || > + folio_test_fscache(folio)) { > + folio_unlock(folio); > + if (wbc->sync_mode != WB_SYNC_NONE) { > + folio_wait_writeback(folio); > #ifdef CONFIG_CIFS_FSCACHE > - folio_wait_fscache(folio); > + folio_wait_fscache(folio); > #endif > - goto redo_folio; > - } > - > - if (!folio_clear_dirty_for_io(folio)) > - /* We hold the page lock - it should've been dirty. */ > - WARN_ON(1); > - > - ret = cifs_write_back_from_locked_folio(mapping, wbc, folio, start, end); > - if (ret < 0) > - goto write_error; > - > - start += ret; > - continue; > - > -write_error: > - folio_batch_release(&fbatch); > - *_next = start; > - return ret; > + goto lock_again; > + } > > -skip_write: > - /* > - * Too many skipped writes, or need to reschedule? > - * Treat it as a write error without an error code. > - */ > + start += folio_size(folio); > + if (wbc->sync_mode == WB_SYNC_NONE) { > if (skips >= 5 || need_resched()) { > ret = 0; > - goto write_error; > + goto out; > } > - > - /* Otherwise, just skip that folio and go on to the next */ > skips++; > - start += folio_size(folio); > - continue; > } > + goto search_again; > + } > > - folio_batch_release(&fbatch); > - cond_resched(); > - } while (wbc->nr_to_write > 0); > + ret = cifs_write_back_from_locked_folio(mapping, wbc, xas, folio, start, end); > +out: > + if (ret > 0) > + *_start = start + ret; > + return ret; > +} > > - *_next = start; > - return 0; > +/* > + * Write a region of pages back to the server > + */ > +static int cifs_writepages_region(struct address_space *mapping, > + struct writeback_control *wbc, > + unsigned long long *_start, > + unsigned long long end) > +{ > + ssize_t ret; > + > + XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE); > + > + do { > + ret = cifs_writepages_begin(mapping, wbc, &xas, _start, end); > + if (ret > 0 && wbc->nr_to_write > 0) > + cond_resched(); > + } while (ret > 0 && wbc->nr_to_write > 0); > + > + return ret > 0 ? 0 : ret; > } > > /* > @@ -2968,7 +2994,7 @@ static int cifs_writepages_region(struct address_space *mapping, > static int cifs_writepages(struct address_space *mapping, > struct writeback_control *wbc) > { > - loff_t start, next; > + loff_t start, end; > int ret; > > /* We have to be careful as we can end up racing with setattr() > @@ -2976,28 +3002,34 @@ static int cifs_writepages(struct address_space *mapping, > * to prevent it. > */ > > - if (wbc->range_cyclic) { > + if (wbc->range_cyclic && mapping->writeback_index) { > start = mapping->writeback_index * PAGE_SIZE; > - ret = cifs_writepages_region(mapping, wbc, start, LLONG_MAX, &next); > - if (ret == 0) { > - mapping->writeback_index = next / PAGE_SIZE; > - if (start > 0 && wbc->nr_to_write > 0) { > - ret = cifs_writepages_region(mapping, wbc, 0, > - start, &next); > - if (ret == 0) > - mapping->writeback_index = > - next / PAGE_SIZE; > - } > + ret = cifs_writepages_region(mapping, wbc, &start, LLONG_MAX); > + if (ret < 0) > + goto out; > + > + if (wbc->nr_to_write <= 0) { > + mapping->writeback_index = start / PAGE_SIZE; > + goto out; > } > + > + start = 0; > + end = mapping->writeback_index * PAGE_SIZE; > + mapping->writeback_index = 0; > + ret = cifs_writepages_region(mapping, wbc, &start, end); > + if (ret == 0) > + mapping->writeback_index = start / PAGE_SIZE; > } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { > - ret = cifs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next); > + start = 0; > + ret = cifs_writepages_region(mapping, wbc, &start, LLONG_MAX); > if (wbc->nr_to_write > 0 && ret == 0) > - mapping->writeback_index = next / PAGE_SIZE; > + mapping->writeback_index = start / PAGE_SIZE; > } else { > - ret = cifs_writepages_region(mapping, wbc, > - wbc->range_start, wbc->range_end, &next); > + start = wbc->range_start; > + ret = cifs_writepages_region(mapping, wbc, &start, wbc->range_end); > } > > +out: > return ret; > } > > -- Thanks, Steve