Introduce SEEK_DATA/SEEK_HOLE to XFS V5

Jeff Liu <jeff.liu@xxxxxxxxxx> · Fri, 06 Jan 2012 21:28:58 +0800

Hello,

This is a revised patch that addresses Christoph's comments on V4.

Changes to V5:
--------------
* Revise xfs_has_unwritten_buffer() to look up the pages that match a given tag.
* For unwritten extents, both xfs_seek_data() and xfs_seek_hole() call xfs_has_unwritten_buffer() to search
  DIRTY pages first; if no dirty data is found, they call it again to search WRITEBACK pages.
* In xfs_seek_hole(), if dirty data is found in the page cache for an unwritten extent but its start offset is past the start block
  of the map, treat the range before it as a hole and return that offset if possible (data_buffer_offset > max(seek_offset, start_block_of_map)).

Tests:
------
seek sanity tester:
http://patchwork.xfs.org/patch/3108/
seek copy tester:
http://patchwork.xfs.org/patch/3109/


Thanks,
-Jeff

Signed-off-by: Jie Liu <jeff.liu@xxxxxxxxxx>

---
 fs/xfs/xfs_file.c |  466 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 465 insertions(+), 1 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 753ed9b..24ae40a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,6 +38,7 @@
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
+#include <linux/pagevec.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -1141,8 +1142,471 @@ xfs_vm_page_mkwrite(
 	return block_page_mkwrite(vma, vmf, xfs_get_blocks);
 }
 
+/*
+ * Scan the page cache pages tagged @tag that cover the extent @map for the
+ * first buffer head that is BH_Unwritten or BH_Uptodate. Returns true with
+ * its byte offset stored in *@offset, or false with *@offset set to zero.
+ */
+STATIC bool
+xfs_has_unwritten_buffer(
+	struct inode		*inode,	/* inode whose mapping is searched */
+	struct xfs_bmbt_irec	*map,	/* extent giving the page range */
+	int			tag,	/* PAGECACHE_TAG_* to look up */
+	loff_t			*offset)	/* out: offset of the data found */
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct pagevec		pvec;
+	pgoff_t			index;
+	pgoff_t			end;
+	bool			found = false;
+
+	pagevec_init(&pvec, 0);
+
+	index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT;
+	end = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount)
+			   >> PAGE_CACHE_SHIFT;
+
+	do {
+		unsigned int	i;
+		unsigned	nr_pages;
+		int		want = min_t(pgoff_t, end - index,
+					     (pgoff_t)PAGEVEC_SIZE - 1) + 1;
+		nr_pages = pagevec_lookup_tag(&pvec, inode->i_mapping,
+					      &index, tag, want);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page		*page = pvec.pages[i];
+			struct buffer_head	*bh;
+			struct buffer_head	*head;
+			xfs_fileoff_t		last;
+
+			if (!page_has_buffers(page))
+				continue;
+
+			/*
+			 * Stop the whole search as soon as a page lies
+			 * beyond the last page index of the extent.
+			 */
+			if (page->index > end)
+				goto out;
+
+			/* Cast first so a 32-bit pgoff_t cannot overflow */
+			last = XFS_B_TO_FSBT(mp,
+				(loff_t)page->index << PAGE_CACHE_SHIFT);
+
+			bh = head = page_buffers(page);
+			do {
+				/*
+				 * An XFS_EXT_UNWRITTEN extent has disk
+				 * blocks mapped to it, but no data has
+				 * been committed to them yet. Dirty data
+				 * in the page cache is identified by
+				 * BH_Unwritten being set on the buffer.
+				 * The buffer may also be BH_Uptodate if
+				 * writeback was started, so check for
+				 * that state as well.
+				 */
+				if (buffer_unwritten(bh) ||
+				    buffer_uptodate(bh)) {
+					found = true;
+					*offset = XFS_FSB_TO_B(mp, last);
+					goto out;
+				}
+				last++;
+			} while ((bh = bh->b_this_page) != head);
+		}
+
+		/*
+		 * If fewer pages were returned than requested, there are
+		 * no more tagged pages to examine and the search is done.
+		 */
+		if (nr_pages < want)
+			break;
+
+		index = pvec.pages[i - 1]->index + 1;
+		pagevec_release(&pvec);
+	} while (index < end);
+
+out:
+	pagevec_release(&pvec);
+	if (!found)
+		*offset = 0;
+
+	return found;
+}
+
+STATIC loff_t
+xfs_seek_data(
+	struct file		*file,	/* file being seeked */
+	loff_t			start)	/* byte offset to search from */
+{
+	struct inode		*inode = file->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fsize_t		isize = i_size_read(inode);
+	loff_t			offset = 0;	/* data offset to return */
+	struct xfs_ifork	*ifp;	/* NOTE(review): set but never read */
+	xfs_fileoff_t		fsbno;	/* block the current scan starts at */
+	xfs_filblks_t		len;	/* file size rounded up to blocks */
+	int			lock;
+	int			error;
+
+	lock = xfs_ilock_map_shared(ip);
+
+	if (start >= isize) {
+		error = ENXIO;
+		goto out_lock;
+	}
+
+	fsbno = XFS_B_TO_FSBT(mp, start);
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	len = XFS_B_TO_FSB(mp, isize);
+
+	for (;;) {
+		struct xfs_bmbt_irec	map[2];
+		int			nmap = 2;
+		loff_t			seekoff;
+
+		error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
+				       XFS_BMAPI_ENTIRE);
+		if (error)
+			goto out_lock;
+
+		/* Nothing mapped at this offset, so it must be beyond EOF */
+		if (!nmap) {
+			error = ENXIO;
+			goto out_lock;
+		}
+
+		seekoff = XFS_FSB_TO_B(mp, fsbno);
+		/*
+		 * map[0] is a hole: examine the next extent, map[1].
+		 * A delalloc or normal map[1] holds data, so the result
+		 * is its start offset. An unwritten map[1] is probed
+		 * in the page cache; without data buffers it counts as
+		 * a hole and fsbno is moved past it for the next scan.
+		 * NOTE(review): after that fsbno update control falls
+		 * through to the map[0] checks below; if a hole mapping
+		 * carries br_state == XFS_EXT_NORM, the "real extent"
+		 * test would then match the hole - TODO confirm.
+		 */
+		if (map[0].br_startblock == HOLESTARTBLOCK) {
+			/*
+			 * Only the hole was mapped, so there is no data
+			 * extent after the given offset: fail the seek
+			 * with ENXIO.
+			 */
+			if (nmap == 1) {
+				error = ENXIO;
+				break;
+			}
+
+			if (map[1].br_state == XFS_EXT_NORM ||
+			    map[1].br_startblock == DELAYSTARTBLOCK) {
+				offset = max_t(loff_t, seekoff,
+					XFS_FSB_TO_B(mp, map[1].br_startoff));
+
+				break;
+			} else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+				if (xfs_has_unwritten_buffer(inode, &map[1],
+						     PAGECACHE_TAG_DIRTY,
+						     &offset) ||
+				    xfs_has_unwritten_buffer(inode, &map[1],
+						     PAGECACHE_TAG_WRITEBACK,
+						     &offset)) {
+					offset = max_t(loff_t, seekoff, offset);
+					break;
+				}
+			}
+
+			fsbno = map[1].br_startoff + map[1].br_blockcount;
+		}
+
+		/*
+		 * map[0] is an unwritten extent: first probe the page cache
+		 * for a data buffer offset. If nothing is found, treat the
+		 * extent as a hole and examine map[1] in the same way as
+		 * for the hole case above.
+		 */
+		if (map[0].br_state == XFS_EXT_UNWRITTEN) {
+			if (xfs_has_unwritten_buffer(inode, &map[0],
+						     PAGECACHE_TAG_DIRTY,
+						     &offset) ||
+			    xfs_has_unwritten_buffer(inode, &map[0],
+						     PAGECACHE_TAG_WRITEBACK,
+						     &offset)) {
+				offset = max_t(loff_t, seekoff, offset);
+				break;
+			}
+
+			/* Only map[0] was returned: no data extent follows */
+			if (nmap == 1) {
+				error = ENXIO;
+				break;
+			}
+
+			if (map[1].br_state == XFS_EXT_NORM ||
+			    map[1].br_startblock == DELAYSTARTBLOCK) {
+				offset = max_t(loff_t, seekoff,
+					XFS_FSB_TO_B(mp, map[1].br_startoff));
+				break;
+			} else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+				if (xfs_has_unwritten_buffer(inode, &map[1],
+						     PAGECACHE_TAG_DIRTY,
+						     &offset) ||
+				    xfs_has_unwritten_buffer(inode, &map[1],
+						     PAGECACHE_TAG_WRITEBACK,
+						     &offset)) {
+					offset = max_t(loff_t, seekoff, offset);
+					break;
+				}
+			}
+
+			fsbno = map[1].br_startoff + map[1].br_blockcount;
+		}
+
+		/* map[0] is a delayed allocation or a real data extent */
+		if (map[0].br_startblock == DELAYSTARTBLOCK ||
+		    map[0].br_state == XFS_EXT_NORM) {
+			offset = max_t(loff_t, seekoff,
+				       XFS_FSB_TO_B(mp, map[0].br_startoff));
+			break;
+		}
+
+		/* Fail with ENXIO once the scan has moved beyond EOF */
+		if (XFS_FSB_TO_B(mp, fsbno) > isize) {
+			error = ENXIO;
+			goto out_lock;
+		}
+	}
+
+	if (offset < start)	/* never report a position before @start */
+		offset = start;
+
+	if (offset != file->f_pos)
+		file->f_pos = offset;
+
+out_lock:
+	xfs_iunlock_map_shared(ip, lock);
+	if (error)
+		return -error;	/* ENXIO above is positive; negate it */
+
+	return offset;
+}
+
+STATIC loff_t
+xfs_seek_hole(
+	struct file		*file,	/* file being seeked */
+	loff_t			start)	/* byte offset to search from */
+{
+	struct inode		*inode = file->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fsize_t		isize = i_size_read(inode);
+	loff_t			offset = 0;	/* hole offset to return */
+	struct xfs_ifork	*ifp;	/* NOTE(review): set but never read */
+	xfs_fileoff_t		fsbno;	/* block the current scan starts at */
+	xfs_filblks_t		len;	/* file size rounded up to blocks */
+	int			lock;
+	int			error;
+
+	lock = xfs_ilock_map_shared(ip);
+
+	if (start >= isize) {
+		error = ENXIO;
+		goto out_lock;
+	}
+
+	fsbno = XFS_B_TO_FSBT(mp, start);
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	len = XFS_B_TO_FSB(mp, isize);
+
+	for (;;) {
+		struct xfs_bmbt_irec	map[2];
+		int			nmap = 2;
+		loff_t			seekoff;
+
+		error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
+				       XFS_BMAPI_ENTIRE);
+		if (error)
+			goto out_lock;
+
+		/* Nothing mapped at this offset, so it must be beyond EOF */
+		if (!nmap) {
+			error = ENXIO;
+			goto out_lock;
+		}
+
+		seekoff = XFS_FSB_TO_B(mp, fsbno);
+		/*
+		 * map[0] is an unwritten extent: probe the page cache for
+		 * dirty or writeback data. If nothing is found, treat the
+		 * extent as a hole. If data is found but it starts past
+		 * both the start of the map and the current seek offset,
+		 * the range in front of it is a hole as well. Otherwise
+		 * move on to the next extent, map[1], to look for a hole
+		 * there.
+		 */
+		if (map[0].br_state == XFS_EXT_UNWRITTEN) {
+			if (xfs_has_unwritten_buffer(inode, &map[0],
+						     PAGECACHE_TAG_DIRTY,
+						     &offset) ||
+			    xfs_has_unwritten_buffer(inode, &map[0],
+						     PAGECACHE_TAG_WRITEBACK,
+						     &offset)) {
+				if (offset > max_t(loff_t, seekoff,
+						   XFS_FSB_TO_B(mp,
+						   map[0].br_startoff))) {
+					offset = max_t(loff_t, seekoff,
+						       XFS_FSB_TO_B(mp,
+						       map[0].br_startoff));
+					break;
+				}
+			} else {
+				offset = max_t(loff_t, seekoff,
+					XFS_FSB_TO_B(mp, map[0].br_startoff));
+				break;
+			}
+
+			/*
+			 * No further extent was mapped after map[0]:
+			 * report the file size as the hole offset.
+			 */
+			if (nmap == 1) {
+				offset = isize;
+				break;
+			}
+
+			if (map[1].br_startblock == HOLESTARTBLOCK) {
+				offset = max_t(loff_t, seekoff,
+					XFS_FSB_TO_B(mp, map[1].br_startoff));
+				break;
+			} else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+				if (xfs_has_unwritten_buffer(inode, &map[1],
+							PAGECACHE_TAG_DIRTY,
+							&offset) ||
+				    xfs_has_unwritten_buffer(inode, &map[1],
+							PAGECACHE_TAG_WRITEBACK,
+							&offset)) {
+					if (offset > max_t(loff_t, seekoff,
+							XFS_FSB_TO_B(mp,
+							map[1].br_startoff))) {
+						offset = max_t(loff_t, seekoff,
+							XFS_FSB_TO_B(mp,
+							map[1].br_startoff));
+						break;
+					}
+				} else {
+					offset = max_t(loff_t, seekoff,
+					XFS_FSB_TO_B(mp, map[1].br_startoff));
+					break;
+				}
+			}
+
+			fsbno = map[1].br_startoff + map[1].br_blockcount;
+		}
+
+		/*
+		 * map[0] is a delayed allocation or a real data extent. If
+		 * map[1] is a hole, or an unwritten extent with no data in
+		 * the page cache, return its offset; likewise if its cached
+		 * data starts past both the map start and the seek offset.
+		 * NOTE(review): should a hole mapping carry br_state ==
+		 * XFS_EXT_NORM, a hole in map[0] lands here too - confirm.
+		 */
+		if (map[0].br_startblock == DELAYSTARTBLOCK ||
+		    map[0].br_state == XFS_EXT_NORM) {
+			/*
+			 * No further extent was mapped after map[0]:
+			 * report the file size as the hole offset.
+			 */
+			if (nmap == 1) {
+				offset = isize;
+				break;
+			}
+
+			if (map[1].br_startblock == HOLESTARTBLOCK) {
+				offset = max_t(loff_t, seekoff,
+					XFS_FSB_TO_B(mp, map[1].br_startoff));
+				break;
+			} else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+				if (xfs_has_unwritten_buffer(inode, &map[1],
+							PAGECACHE_TAG_DIRTY,
+							&offset) ||
+				    xfs_has_unwritten_buffer(inode, &map[1],
+							PAGECACHE_TAG_WRITEBACK,
+							&offset)) {
+					if (offset > max_t(loff_t, seekoff,
+							XFS_FSB_TO_B(mp,
+							map[1].br_startoff))) {
+						offset = max_t(loff_t, seekoff,
+							XFS_FSB_TO_B(mp,
+							map[1].br_startoff));
+						break;
+					}
+				} else {
+					offset = max_t(loff_t, seekoff,
+					XFS_FSB_TO_B(mp, map[1].br_startoff));
+					break;
+				}
+			}
+
+			fsbno = map[1].br_startoff + map[1].br_blockcount;
+		}
+
+		/* map[0] is a hole, so the search is done */
+		if (map[0].br_startblock == HOLESTARTBLOCK) {
+			offset = max_t(loff_t, seekoff,
+				       XFS_FSB_TO_B(mp, map[0].br_startoff));
+			break;
+		}
+
+		/* Fail with ENXIO once the scan has moved beyond EOF */
+		if (XFS_FSB_TO_B(mp, fsbno) > isize) {
+			error = ENXIO;
+			goto out_lock;
+		}
+	}
+
+	if (offset < start)	/* never report a position before @start */
+		offset = start;
+
+	if (offset != file->f_pos)
+		file->f_pos = offset;
+
+out_lock:
+	xfs_iunlock_map_shared(ip, lock);
+	if (error)
+		return -error;	/* ENXIO above is positive; negate it */
+
+	return offset;
+}
+
+STATIC loff_t
+xfs_file_llseek(
+	struct file	*file,		/* file being seeked */
+	loff_t		offset,		/* requested offset */
+	int		origin)		/* SEEK_* whence value */
+{
+	switch (origin) {
+	case SEEK_END:
+	case SEEK_CUR:
+	case SEEK_SET:
+		return generic_file_llseek(file, offset, origin);
+	case SEEK_DATA:
+		return xfs_seek_data(file, offset);
+	case SEEK_HOLE:
+		return xfs_seek_hole(file, offset);
+	default:
+		return -EINVAL;	/* unknown whence, as lseek() specifies */
+	}
+}
+
 const struct file_operations xfs_file_operations = {
-	.llseek		= generic_file_llseek,
+	.llseek		= xfs_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= xfs_file_aio_read,
-- 
1.7.4.1






_______________________________________________
xfs mailing list
xfs@xxxxxxxxxxx
http://oss.sgi.com/mailman/listinfo/xfs