Re: [patch 1/7] mm: write_cache_pages writepage error fix

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Oct 21, 2008 at 02:55:18PM -0700, Andrew Morton wrote:
> On Tue, 21 Oct 2008 19:09:48 +1100
> npiggin@xxxxxxx wrote:
> 
> > In write_cache_pages, if ret signals a real error, but we still have some pages
> > left in the pagevec, done would be set to 1, but the remaining pages would
> > continue to be processed and ret will be overwritten in the process. It could
> > easily be overwritten with success, and thus success will be returned even if
> > there is an error. Thus the caller is told all writes succeeded, wheras in
> > reality some did not.
> > 
> > Fix this by bailing immediately if there is an error, and retaining the first
> > error code.
> > 
> > This is a data interity bug.
> > 
> > Signed-off-by: Nick Piggin <npiggin@xxxxxxx>
> > ---
> > Index: linux-2.6/mm/page-writeback.c
> > ===================================================================
> > --- linux-2.6.orig/mm/page-writeback.c
> > +++ linux-2.6/mm/page-writeback.c
> > @@ -931,12 +931,16 @@ retry:
> >  			}
> >  
> >  			ret = (*writepage)(page, wbc, data);
> > -
> > -			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
> > -				unlock_page(page);
> > -				ret = 0;
> > -			}
> > -			if (ret || (--nr_to_write <= 0))
> > +			if (unlikely(ret)) {
> > +				if (ret == AOP_WRITEPAGE_ACTIVATE) {
> > +					unlock_page(page);
> > +					ret = 0;
> > +				} else {
> > +					done = 1;
> > +					break;
> > +				}
> > + 			}
> > +			if (--nr_to_write <= 0)
> >  				done = 1;
> >  			if (wbc->nonblocking && bdi_write_congested(bdi)) {
> >  				wbc->encountered_congestion = 1;
> > 
> > -- 
> 
> I don't think I like the implementation much.
> 
> In all cases in that function where we set done=1, we want to bale out
> right now at this page, rather than processing the remaining pages in
> the pagevec.
> 
> So it would be better to implement a bit of code which releases the
> pagevec pages and then breaks out of the loop.  Then this bug
> automatically gets fixed.

I had almost exactly that at some point, then I rethought it because it
was a bigger behaviour change than the bugfix-only.

I thought the !done thing must be just for batching and to process the
pages already looked up in the pagevec.

I'll need to backport at least some of these to stable -- any chance of
making the change you suggest toward the end of the patchset? Let me
know if you want a patch or will do one yourself.


> 
> Like this:
> 
> --- a/mm/page-writeback.c~mm-write_cache_pages-writepage-error-fix
> +++ a/mm/page-writeback.c
> @@ -865,7 +865,6 @@ int write_cache_pages(struct address_spa
>  {
>  	struct backing_dev_info *bdi = mapping->backing_dev_info;
>  	int ret = 0;
> -	int done = 0;
>  	struct pagevec pvec;
>  	int nr_pages;
>  	pgoff_t index;
> @@ -891,10 +890,10 @@ int write_cache_pages(struct address_spa
>  		scanned = 1;
>  	}
>  retry:
> -	while (!done && (index <= end) &&
> +	while ((index <= end) &&
>  	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
> -					      PAGECACHE_TAG_DIRTY,
> -					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
> +				PAGECACHE_TAG_DIRTY,
> +				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
>  		unsigned i;
>  
>  		scanned = 1;
> @@ -903,9 +902,9 @@ retry:
>  
>  			/*
>  			 * At this point we hold neither mapping->tree_lock nor
> -			 * lock on the page itself: the page may be truncated or
> -			 * invalidated (changing page->mapping to NULL), or even
> -			 * swizzled back from swapper_space to tmpfs file
> +			 * lock on the page itself: the page may be truncated
> +			 * or invalidated (changing page->mapping to NULL), or
> +			 * even swizzled back from swapper_space to tmpfs file
>  			 * mapping
>  			 */
>  			lock_page(page);
> @@ -916,8 +915,8 @@ retry:
>  			}
>  
>  			if (!wbc->range_cyclic && page->index > end) {
> -				done = 1;
>  				unlock_page(page);
> +				goto bale;
>  				continue;
>  			}
>  
> @@ -937,16 +936,16 @@ retry:
>  				ret = 0;
>  			}
>  			if (ret || (--nr_to_write <= 0))
> -				done = 1;
> +				goto bale;
>  			if (wbc->nonblocking && bdi_write_congested(bdi)) {
>  				wbc->encountered_congestion = 1;
> -				done = 1;
> +				goto bale;
>  			}
>  		}
>  		pagevec_release(&pvec);
>  		cond_resched();
>  	}
> -	if (!scanned && !done) {
> +	if (!scanned) {
>  		/*
>  		 * We hit the last page and there is more work to be done: wrap
>  		 * back to the start of the file
> @@ -961,7 +960,11 @@ retry:
>  		wbc->nr_to_write = nr_to_write;
>  	}
>  
> +out:
>  	return ret;
> +bale:
> +	pagevec_release(&pvec);
> +	goto out;
>  }
>  EXPORT_SYMBOL(write_cache_pages);
>  
> _
> 
> 
> Which yields this:
> 
> int write_cache_pages(struct address_space *mapping,
> 		      struct writeback_control *wbc, writepage_t writepage,
> 		      void *data)
> {
> 	struct backing_dev_info *bdi = mapping->backing_dev_info;
> 	int ret = 0;
> 	struct pagevec pvec;
> 	int nr_pages;
> 	pgoff_t index;
> 	pgoff_t end;		/* Inclusive */
> 	int scanned = 0;
> 	int range_whole = 0;
> 	long nr_to_write = wbc->nr_to_write;
> 
> 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
> 		wbc->encountered_congestion = 1;
> 		return 0;
> 	}
> 
> 	pagevec_init(&pvec, 0);
> 	if (wbc->range_cyclic) {
> 		index = mapping->writeback_index; /* Start from prev offset */
> 		end = -1;
> 	} else {
> 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
> 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
> 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
> 			range_whole = 1;
> 		scanned = 1;
> 	}
> retry:
> 	while ((index <= end) &&
> 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
> 				PAGECACHE_TAG_DIRTY,
> 				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
> 		unsigned i;
> 
> 		scanned = 1;
> 		for (i = 0; i < nr_pages; i++) {
> 			struct page *page = pvec.pages[i];
> 
> 			/*
> 			 * At this point we hold neither mapping->tree_lock nor
> 			 * lock on the page itself: the page may be truncated
> 			 * or invalidated (changing page->mapping to NULL), or
> 			 * even swizzled back from swapper_space to tmpfs file
> 			 * mapping
> 			 */
> 			lock_page(page);
> 
> 			if (unlikely(page->mapping != mapping)) {
> 				unlock_page(page);
> 				continue;
> 			}
> 
> 			if (!wbc->range_cyclic && page->index > end) {
> 				unlock_page(page);
> 				goto bale;
> 				continue;
> 			}
> 
> 			if (wbc->sync_mode != WB_SYNC_NONE)
> 				wait_on_page_writeback(page);
> 
> 			if (PageWriteback(page) ||
> 			    !clear_page_dirty_for_io(page)) {
> 				unlock_page(page);
> 				continue;
> 			}
> 
> 			ret = (*writepage)(page, wbc, data);
> 
> 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
> 				unlock_page(page);
> 				ret = 0;
> 			}
> 			if (ret || (--nr_to_write <= 0))
> 				goto bale;
> 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
> 				wbc->encountered_congestion = 1;
> 				goto bale;
> 			}
> 		}
> 		pagevec_release(&pvec);
> 		cond_resched();
> 	}
> 	if (!scanned) {
> 		/*
> 		 * We hit the last page and there is more work to be done: wrap
> 		 * back to the start of the file
> 		 */
> 		scanned = 1;
> 		index = 0;
> 		goto retry;
> 	}
> 	if (!wbc->no_nrwrite_index_update) {
> 		if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
> 			mapping->writeback_index = index;
> 		wbc->nr_to_write = nr_to_write;
> 	}
> 
> out:
> 	return ret;
> bale:
> 	pagevec_release(&pvec);
> 	goto out;
> }
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux