On Tue, 21 Oct 2008 19:09:48 +1100 npiggin@xxxxxxx wrote: > In write_cache_pages, if ret signals a real error, but we still have some pages > left in the pagevec, done would be set to 1, but the remaining pages would > continue to be processed and ret will be overwritten in the process. It could > easily be overwritten with success, and thus success will be returned even if > there is an error. Thus the caller is told all writes succeeded, wheras in > reality some did not. > > Fix this by bailing immediately if there is an error, and retaining the first > error code. > > This is a data interity bug. > > Signed-off-by: Nick Piggin <npiggin@xxxxxxx> > --- > Index: linux-2.6/mm/page-writeback.c > =================================================================== > --- linux-2.6.orig/mm/page-writeback.c > +++ linux-2.6/mm/page-writeback.c > @@ -931,12 +931,16 @@ retry: > } > > ret = (*writepage)(page, wbc, data); > - > - if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { > - unlock_page(page); > - ret = 0; > - } > - if (ret || (--nr_to_write <= 0)) > + if (unlikely(ret)) { > + if (ret == AOP_WRITEPAGE_ACTIVATE) { > + unlock_page(page); > + ret = 0; > + } else { > + done = 1; > + break; > + } > + } > + if (--nr_to_write <= 0) > done = 1; > if (wbc->nonblocking && bdi_write_congested(bdi)) { > wbc->encountered_congestion = 1; > > -- I don't think I like the implementation much. In all cases in that function where we set done=1, we want to bale out right now at this page, rather than processing the remaining pages in the pagevec. So it would be better to implement a bit of code which releases the pagevec pages and then breaks out of the loop. Then this bug automatically gets fixed. Like this: --- a/mm/page-writeback.c~mm-write_cache_pages-writepage-error-fix +++ a/mm/page-writeback.c @@ -865,7 +865,6 @@ int write_cache_pages(struct address_spa { struct backing_dev_info *bdi = mapping->backing_dev_info; int ret = 0; - int done = 0; struct pagevec pvec; int nr_pages; pgoff_t index; @@ -891,10 +890,10 @@ int write_cache_pages(struct address_spa scanned = 1; } retry: - while (!done && (index <= end) && + while ((index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { unsigned i; scanned = 1; @@ -903,9 +902,9 @@ retry: /* * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file + * lock on the page itself: the page may be truncated + * or invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file * mapping */ lock_page(page); @@ -916,8 +915,8 @@ retry: } if (!wbc->range_cyclic && page->index > end) { - done = 1; unlock_page(page); + goto bale; continue; } @@ -937,16 +936,16 @@ retry: ret = 0; } if (ret || (--nr_to_write <= 0)) - done = 1; + goto bale; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; - done = 1; + goto bale; } } pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!scanned) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file @@ -961,7 +960,11 @@ retry: wbc->nr_to_write = nr_to_write; } +out: return ret; +bale: + pagevec_release(&pvec); + goto out; } EXPORT_SYMBOL(write_cache_pages); _ Which yields this: int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data) { struct backing_dev_info *bdi = mapping->backing_dev_info; int ret = 0; struct pagevec pvec; int nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; int range_whole = 0; long nr_to_write = wbc->nr_to_write; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; return 0; } pagevec_init(&pvec, 0); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; scanned = 1; } retry: while ((index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { unsigned i; scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; /* * At this point we hold neither mapping->tree_lock nor * lock on the page itself: the page may be truncated * or invalidated (changing page->mapping to NULL), or * even swizzled back from swapper_space to tmpfs file * mapping */ lock_page(page); if (unlikely(page->mapping != mapping)) { unlock_page(page); continue; } if (!wbc->range_cyclic && page->index > end) { unlock_page(page); goto bale; continue; } if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); if (PageWriteback(page) || !clear_page_dirty_for_io(page)) { unlock_page(page); continue; } ret = (*writepage)(page, wbc, data); if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { unlock_page(page); ret = 0; } if (ret || (--nr_to_write <= 0)) goto bale; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; goto bale; } } pagevec_release(&pvec); cond_resched(); } if (!scanned) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file */ scanned = 1; index = 0; goto retry; } if (!wbc->no_nrwrite_index_update) { if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) mapping->writeback_index = index; wbc->nr_to_write = nr_to_write; } out: return ret; bale: pagevec_release(&pvec); goto out; } -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html