Hi Dave,

On Fri, Jan 14, 2011 at 03:43:34PM -0600, bpm@xxxxxxx wrote:
> On Fri, Jan 14, 2011 at 11:29:00AM +1100, Dave Chinner wrote:
> > I'm sure there are other ways to solve these problems, but these two
> > are the ones that come to mind for me. I'm open to other solutions
> > or ways to improve on these ones, especially if they are simpler. ;)
> > Anyone got any ideas or improvements?
>
> The direction I've been taking is to use XFS_BMAPI_EXACT in
> xfs_iomap_write_allocate to ensure that an extent covering exactly the
> pages we're prepared to write out immediately is allocated, and that
> the rest of the delalloc extent is left as is.  This exercises some of
> the btree code more heavily and led to the discovery of the
> allocate-in-the-middle-of-a-delalloc-extent bug.  It also presents a
> performance issue, which I've tried to resolve by extending
> xfs_probe_cluster to probe delalloc extents: lock all of the pages to
> be converted before performing the allocation and hold those locks
> until they are submitted for writeback.  It's not very pretty, but it
> resolves the corruption.

Here's the xfs_page_state_convert side of the patch so you can get an
idea of what I am trying for, and how ugly it is. ;^)  I have not ported
the xfs_iomap_write_allocate bits yet.  It is against an older version
of xfs... but you get the idea.

-Ben

Index: sles11-src/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- sles11-src.orig/fs/xfs/linux-2.6/xfs_aops.c
+++ sles11-src/fs/xfs/linux-2.6/xfs_aops.c
@@ -585,41 +585,61 @@ STATIC unsigned int
 xfs_probe_page(
 	struct page		*page,
 	unsigned int		pg_offset,
-	int			mapped)
+	int			mapped,
+	unsigned int		type,
+	int			*entire)
 {
+	struct buffer_head	*bh, *head;
 	int			ret = 0;

 	if (PageWriteback(page))
 		return 0;
+	if (!page->mapping)
+		return 0;
+	if (!PageDirty(page))
+		return 0;
+	if (!page_has_buffers(page))
+		return 0;

-	if (page->mapping && PageDirty(page)) {
-		if (page_has_buffers(page)) {
-			struct buffer_head	*bh, *head;
-
-			bh = head = page_buffers(page);
-			do {
-				if (!buffer_uptodate(bh))
-					break;
-				if (mapped != buffer_mapped(bh))
-					break;
-				ret += bh->b_size;
-				if (ret >= pg_offset)
-					break;
-			} while ((bh = bh->b_this_page) != head);
-		} else
-			ret = mapped ? 0 : PAGE_CACHE_SIZE;
-	}
+	*entire = 0;
+	bh = head = page_buffers(page);
+	do {
+		if (!buffer_uptodate(bh))
+			break;
+		if (buffer_mapped(bh) != mapped)
+			break;
+		if (type == IOMAP_UNWRITTEN && !buffer_unwritten(bh))
+			break;
+		if (type == IOMAP_DELAY && !buffer_delay(bh))
+			break;
+		if (type == IOMAP_NEW && !buffer_dirty(bh))
+			break;
+
+		ret += bh->b_size;
+
+		if (ret >= pg_offset)
+			break;
+	} while ((bh = bh->b_this_page) != head);
+
+	if (bh == head)
+		*entire = 1;

 	return ret;
 }

+#define MAX_WRITEBACK_PAGES	1024
+
 STATIC size_t
 xfs_probe_cluster(
 	struct inode		*inode,
 	struct page		*startpage,
 	struct buffer_head	*bh,
 	struct buffer_head	*head,
-	int			mapped)
+	int			mapped,
+	unsigned int		type,
+	struct page		**pages,
+	int			*pagecount,
+	struct writeback_control *wbc)
 {
 	struct pagevec		pvec;
 	pgoff_t			tindex, tlast, tloff;
@@ -628,8 +648,15 @@ xfs_probe_cluster(

 	/* First sum forwards in this page */
 	do {
-		if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh)))
+		if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh))) {
+			return total;
+		} else if (type == IOMAP_UNWRITTEN && !buffer_unwritten(bh)) {
 			return total;
+		} else if (type == IOMAP_DELAY && !buffer_delay(bh)) {
+			return total;
+		} else if (type == IOMAP_NEW && !buffer_dirty(bh)) {
+			return total;
+		}
 		total += bh->b_size;
 	} while ((bh = bh->b_this_page) != head);

@@ -637,8 +664,9 @@ xfs_probe_cluster(
 	tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
 	tindex = startpage->index + 1;

-	/* Prune this back to avoid pathological behavior */
-	tloff = min(tlast, startpage->index + 64);
+	/* Prune this back to avoid pathological behavior, subtract 1 for the
+	 * first page. */
+	tloff = min(tlast, startpage->index + (pgoff_t)MAX_WRITEBACK_PAGES);

 	pagevec_init(&pvec, 0);
 	while (!done && tindex <= tloff) {
@@ -647,10 +675,10 @@
 		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
 			break;

-		for (i = 0; i < pagevec_count(&pvec); i++) {
+		for (i = 0; i < pagevec_count(&pvec) && !done; i++) {
 			struct page	*page = pvec.pages[i];
 			size_t		pg_offset, pg_len = 0;
-
+			int		entire = 0;
 			if (tindex == tlast) {
 				pg_offset =
 					i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
@@ -658,20 +686,39 @@
 					done = 1;
 					break;
 				}
-			} else
+			} else {
 				pg_offset = PAGE_CACHE_SIZE;
-
-			if (page->index == tindex && trylock_page(page)) {
-				pg_len = xfs_probe_page(page, pg_offset, mapped);
-				unlock_page(page);
 			}
+			if (page->index == tindex &&
+			    *pagecount < MAX_WRITEBACK_PAGES - 1 &&
+			    trylock_page(page)) {
+				pg_len = xfs_probe_page(page, pg_offset,
+							mapped, type, &entire);
+				if (pg_len) {
+					pages[(*pagecount)++] = page;
+
+				} else {
+					unlock_page(page);
+				}
+			}
 			if (!pg_len) {
 				done = 1;
 				break;
 			}

 			total += pg_len;
+
+			/*
+			 * if probe did not succeed on all buffers in the page
+			 * we don't want to probe subsequent pages.  This
+			 * ensures that we don't have a mix of buffer types in
+			 * the iomap.
+			 */
+			if (!entire) {
+				done = 1;
+				break;
+			}
 			tindex++;
 		}

@@ -683,56 +730,19 @@ xfs_probe_cluster(
 }

 /*
- * Test if a given page is suitable for writing as part of an unwritten
- * or delayed allocate extent.
- */
-STATIC int
-xfs_is_delayed_page(
-	struct page		*page,
-	unsigned int		type)
-{
-	if (PageWriteback(page))
-		return 0;
-
-	if (page->mapping && page_has_buffers(page)) {
-		struct buffer_head	*bh, *head;
-		int			acceptable = 0;
-
-		bh = head = page_buffers(page);
-		do {
-			if (buffer_unwritten(bh))
-				acceptable = (type == IOMAP_UNWRITTEN);
-			else if (buffer_delay(bh))
-				acceptable = (type == IOMAP_DELAY);
-			else if (buffer_dirty(bh) && buffer_mapped(bh))
-				acceptable = (type == IOMAP_NEW);
-			else
-				break;
-		} while ((bh = bh->b_this_page) != head);
-
-		if (acceptable)
-			return 1;
-	}
-
-	return 0;
-}
-
-/*
  * Allocate & map buffers for page given the extent map. Write it out.
  * except for the original page of a writepage, this is called on
  * delalloc/unwritten pages only, for the original page it is possible
  * that the page has no mapping at all.
  */
-STATIC int
+STATIC void
 xfs_convert_page(
 	struct inode		*inode,
 	struct page		*page,
-	loff_t			tindex,
 	xfs_iomap_t		*mp,
 	xfs_ioend_t		**ioendp,
 	struct writeback_control *wbc,
-	int			startio,
-	int			all_bh)
+	int			startio)
 {
 	struct buffer_head	*bh, *head;
 	xfs_off_t		end_offset;
@@ -740,20 +750,9 @@ xfs_convert_page(
 	unsigned int		type;
 	int			bbits = inode->i_blkbits;
 	int			len, page_dirty;
-	int			count = 0, done = 0, uptodate = 1;
+	int			count = 0, uptodate = 1;
 	xfs_off_t		offset = page_offset(page);

-	if (page->index != tindex)
-		goto fail;
-	if (!trylock_page(page))
-		goto fail;
-	if (PageWriteback(page))
-		goto fail_unlock_page;
-	if (page->mapping != inode->i_mapping)
-		goto fail_unlock_page;
-	if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
-		goto fail_unlock_page;
-
 	/*
 	 * page_dirty is initially a count of buffers on the page before
 	 * EOF and is decremented as we move each into a cleanable state.
@@ -770,6 +769,8 @@ xfs_convert_page(
 	end_offset = min_t(unsigned long long,
 			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
 			i_size_read(inode));
+	end_offset = min_t(unsigned long long, end_offset,
+			(mp->iomap_offset + mp->iomap_bsize));

 	len = 1 << inode->i_blkbits;
 	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
@@ -779,12 +780,12 @@
 	bh = head = page_buffers(page);
 	do {
-		if (offset >= end_offset)
+		if (offset >= end_offset) {
 			break;
+		}
 		if (!buffer_uptodate(bh))
 			uptodate = 0;
 		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
-			done = 1;
 			continue;
 		}
@@ -794,10 +795,7 @@
 			else
 				type = IOMAP_DELAY;

-			if (!xfs_iomap_valid(mp, offset)) {
-				done = 1;
-				continue;
-			}
+			BUG_ON(!xfs_iomap_valid(mp, offset));

 			ASSERT(!(mp->iomap_flags & IOMAP_HOLE));
 			ASSERT(!(mp->iomap_flags & IOMAP_DELAY));
@@ -805,7 +803,7 @@
 			xfs_map_at_offset(bh, offset, bbits, mp);
 			if (startio) {
 				xfs_add_to_ioend(inode, bh, offset,
-						type, ioendp, done);
+						type, ioendp, 0 /* !done */);
 			} else {
 				set_buffer_dirty(bh);
 				unlock_buffer(bh);
@@ -814,15 +812,14 @@
 			page_dirty--;
 			count++;
 		} else {
+			WARN_ON(!xfs_iomap_valid(mp, offset));
 			type = IOMAP_NEW;
-			if (buffer_mapped(bh) && all_bh && startio) {
+			if (buffer_mapped(bh) && buffer_dirty(bh) && startio) {
 				lock_buffer(bh);
 				xfs_add_to_ioend(inode, bh, offset,
-						type, ioendp, done);
+						type, ioendp, 0 /* !done */);
 				count++;
 				page_dirty--;
-			} else {
-				done = 1;
 			}
 		}
 	} while (offset += len, (bh = bh->b_this_page) != head);
@@ -838,19 +835,16 @@
 			wbc->nr_to_write--;
 			if (bdi_write_congested(bdi)) {
 				wbc->encountered_congestion = 1;
-				done = 1;
 			} else if (wbc->nr_to_write <= 0) {
-				done = 1;
+				/* XXX ignore nr_to_write
+				done = 1; */
 			}
 		}
+		/* unlocks page */
 		xfs_start_page_writeback(page, wbc, !page_dirty, count);
+	} else {
+		unlock_page(page);
 	}
-
-	return done;
- fail_unlock_page:
-	unlock_page(page);
- fail:
-	return 1;
 }

 /*
@@ -860,33 +854,17 @@ xfs_convert_page(
 STATIC void
 xfs_cluster_write(
 	struct inode		*inode,
-	pgoff_t			tindex,
+	struct page		**pages,
+	int			pagecount,
 	xfs_iomap_t		*iomapp,
 	xfs_ioend_t		**ioendp,
 	struct writeback_control *wbc,
-	int			startio,
-	int			all_bh,
-	pgoff_t			tlast)
+	int			startio)
 {
-	struct pagevec		pvec;
-	int			done = 0, i;
-
-	pagevec_init(&pvec, 0);
-	while (!done && tindex <= tlast) {
-		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
-		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
-			break;
-
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-					iomapp, ioendp, wbc, startio, all_bh);
-			if (done)
-				break;
-		}
+	int			i;

-		pagevec_release(&pvec);
-		cond_resched();
+	for (i = 0; i < pagecount; i++) {
+		xfs_convert_page(inode, pages[i], iomapp, ioendp, wbc, startio);
 	}
 }

@@ -908,7 +886,6 @@ xfs_cluster_write(
  * bh->b_states's will not agree and only ones setup by BPW/BCW will have
  * valid state, thus the whole page must be written out thing.
  */
-
 STATIC int
 xfs_page_state_convert(
 	struct inode	*inode,
@@ -924,12 +901,13 @@
 	unsigned long		p_offset = 0;
 	unsigned int		type;
 	__uint64_t		end_offset;
-	pgoff_t			end_index, last_index, tlast;
+	pgoff_t			end_index, last_index;
 	ssize_t			size, len;
 	int			flags, err, iomap_valid = 0, uptodate = 1;
 	int			page_dirty, count = 0;
 	int			trylock = 0;
-	int			all_bh = unmapped;
+	struct page		**pages;
+	int			pagecount = 0, i;

 	if (startio) {
 		if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
@@ -949,6 +927,9 @@
 		}
 	}

+	pages = kmem_zalloc(sizeof(struct page*) * MAX_WRITEBACK_PAGES, KM_NOFS);
+
+
 	/*
 	 * page_dirty is initially a count of buffers on the page before
 	 * EOF and is decremented as we move each into a cleanable state.
@@ -1036,14 +1017,23 @@
 				 * for unwritten extent conversion.
 				 */
 				new_ioend = 1;
-				if (type == IOMAP_NEW) {
-					size = xfs_probe_cluster(inode,
-							page, bh, head, 0);
-				} else {
-					size = len;
+				size = 0;
+				if (type == IOMAP_NEW && !pagecount) {
+					size = xfs_probe_cluster(inode, page,
+							bh, head,
+							0 /* !mapped */, type,
+							pages, &pagecount, wbc);
+				} else if ((type == IOMAP_DELAY ||
+					    type == IOMAP_UNWRITTEN) &&
+					   !pagecount) {
+					size = xfs_probe_cluster(inode, page,
+							bh, head,
+							1 /* mapped */, type,
+							pages, &pagecount, wbc);
 				}

-				err = xfs_map_blocks(inode, offset, size,
+				err = xfs_map_blocks(inode, offset,
+						size ? size : len,
 						&iomap, flags);
 				if (err)
 					goto error;
@@ -1072,9 +1062,16 @@
 			 */
 			if (!iomap_valid || flags != BMAPI_READ) {
 				flags = BMAPI_READ;
-				size = xfs_probe_cluster(inode, page, bh,
-							head, 1);
-				err = xfs_map_blocks(inode, offset, size,
+				size = 0;
+				if (!pagecount) {
+					size = xfs_probe_cluster(inode, page,
+							bh, head,
+							1 /* mapped */,
+							IOMAP_NEW,
+							pages, &pagecount, wbc);
+				}
+				err = xfs_map_blocks(inode, offset,
+						size ? size : len,
 						&iomap, flags);
 				if (err)
 					goto error;
@@ -1092,8 +1089,6 @@
 				type = IOMAP_NEW;
 			if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
 				ASSERT(buffer_mapped(bh));
-				if (iomap_valid)
-					all_bh = 1;
 				xfs_add_to_ioend(inode, bh, offset,
 						type, &ioend, !iomap_valid);
 				page_dirty--;
@@ -1104,6 +1099,8 @@
 		} else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
 			   (unmapped || startio)) {
 			iomap_valid = 0;
+		} else {
+			WARN_ON(1);
 		}

 		if (!iohead)
@@ -1117,13 +1114,11 @@
 	if (startio)
 		xfs_start_page_writeback(page, wbc, 1, count);

-	if (ioend && iomap_valid) {
-		offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
-					PAGE_CACHE_SHIFT;
-		tlast = min_t(pgoff_t, offset, last_index);
-		xfs_cluster_write(inode, page->index + 1, &iomap, &ioend,
-					wbc, startio, all_bh, tlast);
+	if (ioend && iomap_valid && pagecount) {
+		xfs_cluster_write(inode, pages, pagecount, &iomap, &ioend,
+					wbc, startio);
 	}
+	kmem_free(pages, sizeof(struct page *) * MAX_WRITEBACK_PAGES);

 	if (iohead)
 		xfs_submit_ioend(iohead);
@@ -1133,7 +1128,12 @@
 error:
 	if (iohead)
 		xfs_cancel_ioend(iohead);
-
+	if (pages) {
+		for (i = 0; i < pagecount; i++) {
+			unlock_page(pages[i]);
+		}
+		kmem_free(pages, sizeof(struct page *) * MAX_WRITEBACK_PAGES);
+	}
 	return err;
 }
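
P.S. For anyone skimming the diff rather than applying it, the locking
discipline the probe changes introduce boils down to: trylock a
contiguous run of dirty pages up front, stop at the first page that
doesn't qualify, convert exactly the locked run, and drop every lock on
the error path.  Here is a minimal stand-alone user-space C sketch of
that pattern.  All of the names (item_trylock, probe_run, and so on) are
made up for illustration; this is not XFS code.

/*
 * Sketch of the probe-and-lock pattern: trylock a contiguous run of
 * items, stop at the first failure, and either hand the whole locked
 * run to the "allocation" step or unlock everything.  Nothing is ever
 * allocated beyond what is already locked for writeback.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_RUN 1024

struct item {
	bool locked;
	bool dirty;
};

/* Analogue of trylock_page(): fail rather than block. */
static bool item_trylock(struct item *it)
{
	if (it->locked)
		return false;
	it->locked = true;
	return true;
}

static void item_unlock(struct item *it)
{
	it->locked = false;
}

/*
 * Analogue of the extended xfs_probe_cluster(): collect locks over a
 * contiguous dirty run.  Every item placed in run[] stays locked until
 * the caller converts it or bails out.
 */
static int probe_run(struct item *items, size_t n,
		     struct item **run, size_t *count)
{
	*count = 0;
	for (size_t i = 0; i < n && *count < MAX_RUN; i++) {
		/* Stop at the first ineligible or already-locked item. */
		if (!items[i].dirty || !item_trylock(&items[i]))
			break;
		run[(*count)++] = &items[i];
	}
	return *count ? 0 : -1;
}

int main(void)
{
	struct item items[8] = {
		{ .dirty = true }, { .dirty = true }, { .dirty = false },
	};
	struct item *run[MAX_RUN];
	size_t count;

	if (probe_run(items, 8, run, &count) == 0) {
		/* "Allocate" exactly for the locked run, then release. */
		printf("converting %zu items\n", count);
		for (size_t i = 0; i < count; i++)
			item_unlock(run[i]);
	}
	return 0;
}

The same shape appears in the patch: xfs_probe_cluster fills pages[]
with locked pages, xfs_page_state_convert hands them to
xfs_cluster_write, and the error path unlocks whatever was collected.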