[PATCH 6/8] dio: add an entry point which takes pages

Zach Brown <zach.brown@xxxxxxxxxx> · Thu, 22 Oct 2009 13:25:55 -0700

This adds a high level entry point into the direct-io code which calls helpers
on memory specified by pages instead of iovecs.

A non-zero curr_user_address is now used to decide if we should be dirtying
the memory pages on reads.  The new entry point sets it to 0 because, in our
case, we don't want to.

The trick here is to initialize the dio state so that do_direct_IO() consumes
the pages we provide and never tries to map user pages.  This is done by
queueing one provided page per do_direct_IO() call and making sure that
final_block_in_request covers exactly the blocks in that page.

Signed-off-by: Zach Brown <zach.brown@xxxxxxxxxx>
---
 fs/direct-io.c     |   86 +++++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/fs.h |    4 ++
 2 files changed, 86 insertions(+), 4 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0a2ba8e..3551c4a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -109,7 +109,7 @@ struct dio {
 	 */
 	int curr_page;			/* changes */
 	int total_pages;		/* doesn't change */
-	unsigned long curr_user_address;/* changes */
+	unsigned long curr_user_address;/* changes, indicates user pages */
 
 	/*
 	 * Page queue.  These variables belong to dio_refill_pages() and
@@ -337,7 +337,7 @@ static void dio_bio_submit(struct dio *dio)
 	dio->refcount++;
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
-	if (dio->is_async && dio->rw == READ)
+	if (dio->is_async && dio->rw == READ && dio->curr_user_address)
 		bio_set_pages_dirty(bio);
 
 	submit_bio(dio->rw, bio);
@@ -403,13 +403,14 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 	if (!uptodate)
 		dio->io_error = -EIO;
 
-	if (dio->is_async && dio->rw == READ) {
+	if (dio->is_async && dio->rw == READ && dio->curr_user_address) {
 		bio_check_pages_dirty(bio);	/* transfers ownership */
 	} else {
 		for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
 			struct page *page = bvec[page_no].bv_page;
 
-			if (dio->rw == READ && !PageCompound(page))
+			if (dio->rw == READ && !PageCompound(page) && 
+			    dio->curr_user_address)
 				set_page_dirty_lock(page);
 			page_cache_release(page);
 		}
@@ -1248,3 +1249,80 @@ out:
 	return ret;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
+/* Direct-I/O entry point taking pages (a bio_vec array), not user iovecs. */
+ssize_t
+__blockdev_direct_IO_pages(int rw, struct kiocb *iocb, struct inode *inode,
+	struct block_device *bdev, const struct bio_vec *bvec, 
+	unsigned long bvec_len, loff_t offset, get_block_t get_block,
+	dio_iodone_t end_io, int dio_lock_type)
+{
+	unsigned blkbits = inode->i_blkbits;	/* passed by address below */
+	ssize_t ret;
+	loff_t end = offset;	/* becomes offset + total bv_len */
+	struct dio *dio;
+	unsigned long i;
+
+	if (dio_unaligned(offset, &blkbits, bdev)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Blocks cannot straddle pages: check alignment and total up 'end' */
+	for (i = 0; i < bvec_len; i++) {
+		end += bvec[i].bv_len;
+		if (dio_unaligned(bvec[i].bv_len | bvec[i].bv_offset,
+				  &blkbits, bdev)) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	dio = dio_prepare(rw, iocb, inode, blkbits, offset, end, get_block,
+			  end_io, dio_lock_type);
+	if (!dio) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = dio_lock_and_flush(rw, inode, dio_lock_type, offset, end);
+	if (ret) {
+		kfree(dio);	/* do_direct_IO() has not been called yet */
+		goto out;
+	}
+
+	dio->pages_in_io = bvec_len;	/* one page per bio_vec */
+
+	for (i = 0; i < bvec_len; i++) {
+		dio->size += bvec[i].bv_len;
+
+		/* Index of the first block within this page */
+		dio->first_block_in_page = bvec[i].bv_offset >> blkbits;
+		dio->final_block_in_request = dio->block_in_file +
+						(bvec[i].bv_len  >> blkbits);
+		/* Page fetching state: the queue holds just this one page */
+		dio->curr_page = 0;
+		page_cache_get(bvec[i].bv_page);	/* our own ref */
+		dio->pages[0] = bvec[i].bv_page;
+		dio->head = 0;
+		dio->tail = 1;
+
+		dio->total_pages = 1;
+		dio->curr_user_address = 0;	/* 0: not user pages */
+		/* Meant to consume the queued page without mapping user pages */
+		ret = do_direct_IO(dio);
+		/* bv_len less the bytes still short of final_block_in_request */
+		dio->result += bvec[i].bv_len -
+			((dio->final_block_in_request - dio->block_in_file) <<
+					blkbits);
+
+		if (ret) {
+			dio_cleanup(dio);
+			break;
+		}
+	}
+
+	ret = dio_post_submission(dio, offset, end, ret);
+out:
+	return ret;
+}
+EXPORT_SYMBOL(__blockdev_direct_IO_pages);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2ba15f0..01c0b71 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2266,6 +2266,10 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset,
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
 	int lock_type);
+ssize_t __blockdev_direct_IO_pages(int rw, struct kiocb *iocb,
+	struct inode *inode, struct block_device *bdev,
+	const struct bio_vec *bvec, unsigned long bvec_len, loff_t offset,
+	get_block_t get_block, dio_iodone_t end_io, int dio_lock_type);
 
 enum {
 	DIO_LOCKING = 1, /* need locking between buffered and direct access */
-- 
1.6.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html