Re: [patch 1/7] mm: write_cache_pages writepage error fix

On Tue, 21 Oct 2008 19:09:48 +1100
npiggin@xxxxxxx wrote:

> In write_cache_pages, if ret signals a real error, but we still have some pages
> left in the pagevec, done would be set to 1, but the remaining pages would
> continue to be processed and ret would be overwritten in the process. It could
> easily be overwritten with success, and thus success would be returned even if
> there is an error. Thus the caller is told all writes succeeded, whereas in
> reality some did not.
> 
> Fix this by bailing immediately if there is an error, and retaining the first
> error code.
> 
> This is a data integrity bug.
> 
> Signed-off-by: Nick Piggin <npiggin@xxxxxxx>
> ---
> Index: linux-2.6/mm/page-writeback.c
> ===================================================================
> --- linux-2.6.orig/mm/page-writeback.c
> +++ linux-2.6/mm/page-writeback.c
> @@ -931,12 +931,16 @@ retry:
>  			}
>  
>  			ret = (*writepage)(page, wbc, data);
> -
> -			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
> -				unlock_page(page);
> -				ret = 0;
> -			}
> -			if (ret || (--nr_to_write <= 0))
> +			if (unlikely(ret)) {
> +				if (ret == AOP_WRITEPAGE_ACTIVATE) {
> +					unlock_page(page);
> +					ret = 0;
> +				} else {
> +					done = 1;
> +					break;
> +				}
> + 			}
> +			if (--nr_to_write <= 0)
>  				done = 1;
>  			if (wbc->nonblocking && bdi_write_congested(bdi)) {
>  				wbc->encountered_congestion = 1;
> 
> -- 

I don't think I like the implementation much.

In all cases in that function where we set done=1, we want to bale out
right now at this page, rather than processing the remaining pages in
the pagevec.

So it would be better to implement a bit of code which releases the
pagevec pages and then breaks out of the loop.  Then this bug
automatically gets fixed.

Like this:

--- a/mm/page-writeback.c~mm-write_cache_pages-writepage-error-fix
+++ a/mm/page-writeback.c
@@ -865,7 +865,6 @@ int write_cache_pages(struct address_spa
 {
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	int ret = 0;
-	int done = 0;
 	struct pagevec pvec;
 	int nr_pages;
 	pgoff_t index;
@@ -891,10 +890,10 @@ int write_cache_pages(struct address_spa
 		scanned = 1;
 	}
 retry:
-	while (!done && (index <= end) &&
+	while ((index <= end) &&
 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+				PAGECACHE_TAG_DIRTY,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
 		unsigned i;
 
 		scanned = 1;
@@ -903,9 +902,9 @@ retry:
 
 			/*
 			 * At this point we hold neither mapping->tree_lock nor
-			 * lock on the page itself: the page may be truncated or
-			 * invalidated (changing page->mapping to NULL), or even
-			 * swizzled back from swapper_space to tmpfs file
+			 * lock on the page itself: the page may be truncated
+			 * or invalidated (changing page->mapping to NULL), or
+			 * even swizzled back from swapper_space to tmpfs file
 			 * mapping
 			 */
 			lock_page(page);
@@ -916,8 +915,8 @@ retry:
 			}
 
 			if (!wbc->range_cyclic && page->index > end) {
-				done = 1;
 				unlock_page(page);
+				goto bale;
 				continue;
 			}
 
@@ -937,16 +936,16 @@ retry:
 				ret = 0;
 			}
 			if (ret || (--nr_to_write <= 0))
-				done = 1;
+				goto bale;
 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
 				wbc->encountered_congestion = 1;
-				done = 1;
+				goto bale;
 			}
 		}
 		pagevec_release(&pvec);
 		cond_resched();
 	}
-	if (!scanned && !done) {
+	if (!scanned) {
 		/*
 		 * We hit the last page and there is more work to be done: wrap
 		 * back to the start of the file
@@ -961,7 +960,11 @@ retry:
 		wbc->nr_to_write = nr_to_write;
 	}
 
+out:
 	return ret;
+bale:
+	pagevec_release(&pvec);
+	goto out;
 }
 EXPORT_SYMBOL(write_cache_pages);
 
_


Which yields this:

int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	struct backing_dev_info *bdi = mapping->backing_dev_info;
	int ret = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	int scanned = 0;
	int range_whole = 0;
	long nr_to_write = wbc->nr_to_write;

	if (wbc->nonblocking && bdi_write_congested(bdi)) {
		wbc->encountered_congestion = 1;
		return 0;
	}

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* Start from prev offset */
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		scanned = 1;
	}
retry:
	while ((index <= end) &&
	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
				PAGECACHE_TAG_DIRTY,
				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
		unsigned i;

		scanned = 1;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point we hold neither mapping->tree_lock nor
			 * lock on the page itself: the page may be truncated
			 * or invalidated (changing page->mapping to NULL), or
			 * even swizzled back from swapper_space to tmpfs file
			 * mapping
			 */
			lock_page(page);

			if (unlikely(page->mapping != mapping)) {
				unlock_page(page);
				continue;
			}

			if (!wbc->range_cyclic && page->index > end) {
				unlock_page(page);
				goto bale;
				continue;
			}

			if (wbc->sync_mode != WB_SYNC_NONE)
				wait_on_page_writeback(page);

			if (PageWriteback(page) ||
			    !clear_page_dirty_for_io(page)) {
				unlock_page(page);
				continue;
			}

			ret = (*writepage)(page, wbc, data);

			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
				unlock_page(page);
				ret = 0;
			}
			if (ret || (--nr_to_write <= 0))
				goto bale;
			if (wbc->nonblocking && bdi_write_congested(bdi)) {
				wbc->encountered_congestion = 1;
				goto bale;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!scanned) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		scanned = 1;
		index = 0;
		goto retry;
	}
	if (!wbc->no_nrwrite_index_update) {
		if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
			mapping->writeback_index = index;
		wbc->nr_to_write = nr_to_write;
	}

out:
	return ret;
bale:
	pagevec_release(&pvec);
	goto out;
}
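
For illustration, here is a toy user-space model (not kernel code; the
names are made up) of the control flow in question.  The "old" loop only
notes the error and keeps walking the rest of the pagevec, so a later
successful writepage call overwrites ret and the first error is lost; the
"fixed" loop bails out at once and keeps it:

#include <stdio.h>

#define NPAGES 4

/* pretend page 1 fails with -EIO and the rest succeed */
static int fake_writepage(int page)
{
	return (page == 1) ? -5 : 0;
}

static int old_loop(void)
{
	int ret = 0;
	int done = 0;
	int i;

	for (i = 0; i < NPAGES; i++) {		/* inner pagevec loop ignores 'done' */
		ret = fake_writepage(i);
		if (ret)
			done = 1;		/* noted, but we keep going */
	}
	(void)done;
	return ret;				/* last call wins: the -EIO is gone */
}

static int fixed_loop(void)
{
	int ret = 0;
	int i;

	for (i = 0; i < NPAGES; i++) {
		ret = fake_writepage(i);
		if (ret)
			break;			/* bail immediately, keep the first error */
	}
	return ret;
}

int main(void)
{
	printf("old loop returns   %d (error lost)\n", old_loop());
	printf("fixed loop returns %d (first error kept)\n", fixed_loop());
	return 0;
}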
