Re: [RFC PATCH 1/2] page cache locking for O_DIRECT

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This changes O_DIRECT to take page locks or insert placeholder pages to
lock regions under direct io.

Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx>

diff -r 18a9e9f5c707 fs/direct-io.c
--- a/fs/direct-io.c	Thu Oct 19 08:30:00 2006 +0700
+++ b/fs/direct-io.c	Fri Oct 20 12:38:24 2006 -0400
@@ -35,6 +35,7 @@
 #include <linux/rwsem.h>
 #include <linux/uio.h>
 #include <asm/atomic.h>
+#include <linux/writeback.h>
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
@@ -94,6 +95,14 @@ struct dio {
 	struct buffer_head map_bh;	/* last get_block() result */
 
 	/*
+	 * kernel page pinning
+	 */
+	struct page fake;
+	struct page **fspages;
+	unsigned long nr_fspages;
+	loff_t fs_start_off;
+
+	/*
 	 * Deferred addition of a page to the dio.  These variables are
 	 * private to dio_send_cur_page(), submit_page_section() and
 	 * dio_bio_add_page().
@@ -190,6 +199,66 @@ out:
 	return ret;	
 }
 
+/* Unlock real pages and remove placeholders for pages[0..nr). */
+static void unlock_page_range(struct address_space *mapping,
+			      struct page **pages,
+			      unsigned long start,
+			      unsigned long nr)
+{
+	struct page *last_placeholder = NULL;
+	unsigned long idx;
+	for (idx = 0; idx < nr; idx++) {
+		struct page *cur = pages[idx];
+		if (!PagePlaceHolder(cur)) {
+			unlock_page(cur);
+			page_cache_release(cur);
+			continue;
+		}
+		last_placeholder = cur;
+		remove_placeholder_page(mapping, cur, start + idx);
+	}
+	if (last_placeholder)
+		wake_up_placeholder_page(last_placeholder);
+}
+
+/*
+ * Lock, or cover with the @fake placeholder, page cache indexes
+ * [@start, @start + @nr); pages[k] holds index @start + k.
+ * Returns 0, or -1 after releasing everything taken so far.
+ */
+static int lock_page_range(struct address_space *mapping, struct page **pages,
+			   unsigned long start, unsigned long nr,
+			   struct page *fake)
+{
+	struct page *p;
+	unsigned long numlock = 0;
+	unsigned long end = start + nr;
+	/* widen before shifting: a 32-bit page index would overflow */
+	loff_t end_bytes = (loff_t)end << PAGE_CACHE_SHIFT;
+	while (numlock < nr) {
+		unsigned long i = start + numlock;
+		p = find_or_insert_page(mapping, i, GFP_KERNEL, fake);
+		if (!p)
+			goto fail;
+		if (PageDirty(p)) {
+			/* raced with a write: start IO, then retry this index */
+			unlock_page(p);
+			page_cache_release(p);
+			__filemap_fdatawrite_range(mapping,
+					(loff_t)i << PAGE_CACHE_SHIFT,
+					end_bytes, WB_SYNC_ALL);
+			continue;
+		}
+		pages[numlock++] = p;
+	}
+	/* now that we have all the pages locked, wait for any io */
+	wait_on_page_writeback_range(mapping, start, end);
+	return 0;
+fail:
+	unlock_page_range(mapping, pages, start, numlock);
+	return -1;
+}
+
 /*
  * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
  * buffered inside the dio so that we can call get_user_pages() against a
@@ -219,9 +288,8 @@ static void dio_complete(struct dio *dio
 {
 	if (dio->end_io && dio->result)
 		dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private);
-	if (dio->lock_type == DIO_LOCKING)
-		/* lockdep: non-owner release */
-		up_read_non_owner(&dio->inode->i_alloc_sem);
+	unlock_page_range(dio->inode->i_mapping, dio->fspages,
+	                  dio->fs_start_off, dio->nr_fspages);
 }
 
 /*
@@ -944,7 +1012,7 @@ out:
 }
 
 /*
- * Releases both i_mutex and i_alloc_sem
+ * Releases i_mutex
  */
 static ssize_t
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
@@ -1191,8 +1259,9 @@ __blockdev_direct_IO(int rw, struct kioc
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
 	struct dio *dio;
-	int release_i_mutex = 0;
 	int acquire_i_mutex = 0;
+	struct page **pages = NULL;
+	unsigned long nrpages;
 
 	if (rw & WRITE)
 		rw = WRITE_SYNC;
@@ -1221,12 +1290,21 @@ __blockdev_direct_IO(int rw, struct kioc
 				goto out;
 		}
 	}
-
 	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
 	retval = -ENOMEM;
 	if (!dio)
 		goto out;
 
+	memset(&dio->fake, 0, sizeof(struct page));
+	SetPagePlaceHolder(&dio->fake);
+	nrpages = (end + PAGE_CACHE_SIZE - 1 - offset) >> PAGE_CACHE_SHIFT;
+	dio->fs_start_off = offset >> PAGE_CACHE_SHIFT;
+	pages = kmalloc(sizeof(struct page *) * nrpages, GFP_KERNEL);
+	dio->fspages = pages;
+	dio->nr_fspages = nrpages;
+	if (lock_page_range(inode->i_mapping, pages, dio->fs_start_off, nrpages,
+	                    &dio->fake))
+		goto out;
 	/*
 	 * For block device access DIO_NO_LOCKING is used,
 	 *	neither readers nor writers do any locking at all
@@ -1240,30 +1318,11 @@ __blockdev_direct_IO(int rw, struct kioc
 	if (dio_lock_type != DIO_NO_LOCKING) {
 		/* watch out for a 0 len io from a tricksy fs */
 		if (rw == READ && end > offset) {
-			struct address_space *mapping;
-
-			mapping = iocb->ki_filp->f_mapping;
-			if (dio_lock_type != DIO_OWN_LOCKING) {
-				mutex_lock(&inode->i_mutex);
-				release_i_mutex = 1;
-			}
-
-			retval = filemap_write_and_wait_range(mapping, offset,
-							      end - 1);
-			if (retval) {
-				kfree(dio);
-				goto out;
-			}
-
 			if (dio_lock_type == DIO_OWN_LOCKING) {
 				mutex_unlock(&inode->i_mutex);
 				acquire_i_mutex = 1;
 			}
 		}
-
-		if (dio_lock_type == DIO_LOCKING)
-			/* lockdep: not the owner will release it */
-			down_read_non_owner(&inode->i_alloc_sem);
 	}
 
 	/*
@@ -1278,13 +1337,8 @@ __blockdev_direct_IO(int rw, struct kioc
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
 				nr_segs, blkbits, get_block, end_io, dio);
 
-	if (rw == READ && dio_lock_type == DIO_LOCKING)
-		release_i_mutex = 0;
-
 out:
-	if (release_i_mutex)
-		mutex_unlock(&inode->i_mutex);
-	else if (acquire_i_mutex)
+	if (acquire_i_mutex)
 		mutex_lock(&inode->i_mutex);
 	return retval;
 }
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux