Hello, This is a revised patch according to Christoph's comments at V4. Changes to V5: -------------- * Revise xfs_has_unwritten_buffer() to lookup pages match tag. * For unwritten extents, in both xfs_seek_data() and xfs_seek_hole(), call xfs_has_unwritten_buffer() to search DIRTY pages firstly, if no dirty data found, call it again to search WRITEBACK pages. * In xfs_seek_hole(), if dirty data was found in page cache for an unwritten extents, but its start offset past the start block of the map, treat it as a hole, returns the offset if possible(data_buffer_offset > max(seek_offset, start_block_of_map)). Tests: ------ seek sanity tester: http://patchwork.xfs.org/patch/3108/ seek copy tester: http://patchwork.xfs.org/patch/3109/ Thanks, -Jeff Signed-off-by: Jie Liu <jeff.liu@xxxxxxxxxx> --- fs/xfs/xfs_file.c | 466 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 465 insertions(+), 1 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 753ed9b..24ae40a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -38,6 +38,7 @@ #include <linux/dcache.h> #include <linux/falloc.h> +#include <linux/pagevec.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -1141,8 +1142,471 @@ xfs_vm_page_mkwrite( return block_page_mkwrite(vma, vmf, xfs_get_blocks); } +/* + * Probe the data buffer offset in page cache for unwritten extents. + * Fetch all the pages match @tag, and iterate each page to find out + * if a buffer head state has BH_Unwritten or BH_Uptodate set. + */ +STATIC bool +xfs_has_unwritten_buffer( + struct inode *inode, + struct xfs_bmbt_irec *map, + int tag, + loff_t *offset) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct pagevec pvec; + pgoff_t index; + pgoff_t end; + bool found = false; + + pagevec_init(&pvec, 0); + + index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT; + end = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount) + >> PAGE_CACHE_SHIFT; + + do { + unsigned int i; + unsigned nr_pages; + int want = min_t(pgoff_t, end - index, + (pgoff_t)PAGEVEC_SIZE - 1) + 1; + nr_pages = pagevec_lookup_tag(&pvec, inode->i_mapping, + &index, tag, want); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh; + struct buffer_head *head; + xfs_fileoff_t last; + + if (!page_has_buffers(page)) + continue; + + /* + * There is no need to check the following pages + * if the current page offset is out of range. + */ + if (page->index > end) + goto out; + + last = XFS_B_TO_FSBT(mp, + page->index << PAGE_CACHE_SHIFT); + + bh = head = page_buffers(page); + do { + /* + * An extent in XFS_EXT_UNWRITTEN have disk + * blocks already mapped to it, but no data + * has been committed to them yet. If it has + * dirty data in the page cache it can be + * identified by having BH_Unwritten set in + * each buffer. Also, the buffer head state + * might be in BH_Uptodate too if the buffer + * writeback procedure was fired, we need to + * examine it as well. + */ + if (buffer_unwritten(bh) || + buffer_uptodate(bh)) { + found = true; + *offset = XFS_FSB_TO_B(mp, last); + goto out; + } + last++; + } while ((bh = bh->b_this_page) != head); + } + + /* + * If the number of probed pages less than our desired, + * there should no more pages mapped, search done. + */ + if (nr_pages < want) + break; + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index < end); + +out: + pagevec_release(&pvec); + if (!found) + *offset = 0; + + return found; +} + +STATIC loff_t +xfs_seek_data( + struct file *file, + loff_t start) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t isize = i_size_read(inode); + loff_t offset = 0; + struct xfs_ifork *ifp; + xfs_fileoff_t fsbno; + xfs_filblks_t len; + int lock; + int error; + + lock = xfs_ilock_map_shared(ip); + + if (start >= isize) { + error = ENXIO; + goto out_lock; + } + + fsbno = XFS_B_TO_FSBT(mp, start); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + len = XFS_B_TO_FSB(mp, isize); + + for (;;) { + struct xfs_bmbt_irec map[2]; + int nmap = 2; + loff_t seekoff; + + error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_lock; + + /* No extents at given offset, must be beyond EOF */ + if (!nmap) { + error = ENXIO; + goto out_lock; + } + + seekoff = XFS_FSB_TO_B(mp, fsbno); + /* + * Landed in a hole, skip to check the next extent. + * If the next extent landed in an in-memory data extent, + * or it is a normal extent, its fine to return. + * If the next extent landed in a hole extent, calculate + * the start file system block number for the next scan. + * If the next extent landed in an unwritten extent, we + * need to lookup the page cache to examine the data + * buffer offset, if nothing found, treat it as a hole + * extent too. + */ + if (map[0].br_startblock == HOLESTARTBLOCK) { + /* + * Return ENXIO if no data extent behind + * the given offset. In this case, the seek + * offset should be landed in a hole. + */ + if (nmap == 1) { + error = ENXIO; + break; + } + + if (map[1].br_state == XFS_EXT_NORM || + map[1].br_startblock == DELAYSTARTBLOCK) { + offset = max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, map[1].br_startoff)); + + break; + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_has_unwritten_buffer(inode, &map[1], + PAGECACHE_TAG_DIRTY, + &offset) || + xfs_has_unwritten_buffer(inode, &map[1], + PAGECACHE_TAG_WRITEBACK, + &offset)) { + offset = max_t(loff_t, seekoff, offset); + break; + } + } + + fsbno = map[1].br_startoff + map[1].br_blockcount; + } + + /* + * Landed in an unwritten extent, try to find out the data + * buffer offset from page cache firstly. If nothing was + * found, treat it as a hole, and skip to check the next + * extent, something just like above. + */ + if (map[0].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_has_unwritten_buffer(inode, &map[0], + PAGECACHE_TAG_DIRTY, + &offset) || + xfs_has_unwritten_buffer(inode, &map[0], + PAGECACHE_TAG_WRITEBACK, + &offset)) { + offset = max_t(loff_t, seekoff, offset); + break; + } + + /* No data extent at the given offset */ + if (nmap == 1) { + error = ENXIO; + break; + } + + if (map[1].br_state == XFS_EXT_NORM || + map[1].br_startblock == DELAYSTARTBLOCK) { + offset = max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, map[1].br_startoff)); + break; + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_has_unwritten_buffer(inode, &map[1], + PAGECACHE_TAG_DIRTY, + &offset) || + xfs_has_unwritten_buffer(inode, &map[1], + PAGECACHE_TAG_WRITEBACK, + &offset)) { + offset = max_t(loff_t, seekoff, offset); + break; + } + } + + fsbno = map[1].br_startoff + map[1].br_blockcount; + } + + /* Landed in a delay allocated extent or a real data extent */ + if (map[0].br_startblock == DELAYSTARTBLOCK || + map[0].br_state == XFS_EXT_NORM) { + offset = max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, map[0].br_startoff)); + break; + } + + /* Return ENXIO if beyond eof */ + if (XFS_FSB_TO_B(mp, fsbno) > isize) { + error = ENXIO; + goto out_lock; + } + } + + if (offset < start) + offset = start; + + if (offset != file->f_pos) + file->f_pos = offset; + +out_lock: + xfs_iunlock_map_shared(ip, lock); + if (error) + return -error; + + return offset; +} + +STATIC loff_t +xfs_seek_hole( + struct file *file, + loff_t start) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t isize = i_size_read(inode); + loff_t offset = 0; + struct xfs_ifork *ifp; + xfs_fileoff_t fsbno; + xfs_filblks_t len; + int lock; + int error; + + lock = xfs_ilock_map_shared(ip); + + if (start >= isize) { + error = ENXIO; + goto out_lock; + } + + fsbno = XFS_B_TO_FSBT(mp, start); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + len = XFS_B_TO_FSB(mp, isize); + + for (;;) { + struct xfs_bmbt_irec map[2]; + int nmap = 2; + loff_t seekoff; + + error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_lock; + + /* No extents at given offset, must be beyond EOF */ + if (!nmap) { + error = ENXIO; + goto out_lock; + } + + seekoff = XFS_FSB_TO_B(mp, fsbno); + /* + * Landed in an unwritten extent, try to lookup the page + * cache to find out if there is dirty data or not. If + * nothing was found, treate it as a hole. If there has + * dirty data and its offset starts past both the start + * block of the map and the current seek offset, it should + * be treated as hole too. Otherwise, go through the next + * extent to fetch holes. + */ + if (map[0].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_has_unwritten_buffer(inode, &map[0], + PAGECACHE_TAG_DIRTY, + &offset) || + xfs_has_unwritten_buffer(inode, &map[0], + PAGECACHE_TAG_WRITEBACK, + &offset)) { + if (offset > max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, + map[0].br_startoff))) { + offset = max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, + map[0].br_startoff)); + break; + } + } else { + offset = max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, map[0].br_startoff)); + break; + } + + /* + * No more extent at the given offst, return the total + * file size. + */ + if (nmap == 1) { + offset = isize; + break; + } + + if (map[1].br_startblock == HOLESTARTBLOCK) { + offset = max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, map[1].br_startoff)); + break; + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_has_unwritten_buffer(inode, &map[1], + PAGECACHE_TAG_DIRTY, + &offset) || + xfs_has_unwritten_buffer(inode, &map[1], + PAGECACHE_TAG_WRITEBACK, + &offset)) { + if (offset > max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, + map[1].br_startoff))) { + offset = max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, + map[1].br_startoff)); + break; + } + } else { + offset = max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, map[1].br_startoff)); + break; + } + } + + fsbno = map[1].br_startoff + map[1].br_blockcount; + } + + /* + * Landed in a delay allocated extent or a real data extent, + * if the next extent is landed in a hole or in an unwritten + * extent but without data committed in the page cache, return + * its offset. If the next extent has dirty data in page cache, + * but its offset starts past both the start block of the map + * and the seek offset, it still be a hole. + */ + if (map[0].br_startblock == DELAYSTARTBLOCK || + map[0].br_state == XFS_EXT_NORM) { + /* + * No more extent at the give offset, return the + * total file size. + */ + if (nmap == 1) { + offset = isize; + break; + } + + if (map[1].br_startblock == HOLESTARTBLOCK) { + offset = max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, map[1].br_startoff)); + break; + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_has_unwritten_buffer(inode, &map[1], + PAGECACHE_TAG_DIRTY, + &offset) || + xfs_has_unwritten_buffer(inode, &map[1], + PAGECACHE_TAG_WRITEBACK, + &offset)) { + if (offset > max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, + map[1].br_startoff))) { + offset = max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, + map[1].br_startoff)); + break; + } + } else { + offset = max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, map[1].br_startoff)); + break; + } + } + + fsbno = map[1].br_startoff + map[1].br_blockcount; + } + + /* Landed in a hole, its fine to return */ + if (map[0].br_startblock == HOLESTARTBLOCK) { + offset = max_t(loff_t, seekoff, + XFS_FSB_TO_B(mp, map[0].br_startoff)); + break; + } + + /* Return ENXIO if beyond eof */ + if (XFS_FSB_TO_B(mp, fsbno) > isize) { + error = ENXIO; + goto out_lock; + } + } + + if (offset < start) + offset = start; + + if (offset != file->f_pos) + file->f_pos = offset; + +out_lock: + xfs_iunlock_map_shared(ip, lock); + if (error) + return -error; + + return offset; +} + +STATIC loff_t +xfs_file_llseek( + struct file *file, + loff_t offset, + int origin) +{ + switch (origin) { + case SEEK_END: + case SEEK_CUR: + case SEEK_SET: + return generic_file_llseek(file, offset, origin); + case SEEK_DATA: + return xfs_seek_data(file, offset); + case SEEK_HOLE: + return xfs_seek_hole(file, offset); + default: + return -EOPNOTSUPP; + } +} + const struct file_operations xfs_file_operations = { - .llseek = generic_file_llseek, + .llseek = xfs_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = xfs_file_aio_read, -- 1.7.4.1 _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs