The patch titled
     mm: pagecache write deadlocks
has been removed from the -mm tree.  Its filename is
     mm-pagecache-write-deadlocks.patch

This patch was dropped because it is obsolete

------------------------------------------------------
Subject: mm: pagecache write deadlocks
From: Nick Piggin <npiggin@xxxxxxx>

Signed-off-by: Andrew Morton <akpm@xxxxxxxx>
---

 fs/buffer.c  |   41 +++++++------------------
 fs/libfs.c   |   49 +++++++++++++++++-------------
 mm/filemap.c |   80 ++++++++++++++++++++++++++-----------------------
 mm/filemap.h |   74 +++++++++++++++++++++++++++++----------------
 4 files changed, 134 insertions(+), 110 deletions(-)

diff -puN fs/buffer.c~mm-pagecache-write-deadlocks fs/buffer.c
--- a/fs/buffer.c~mm-pagecache-write-deadlocks
+++ a/fs/buffer.c
@@ -1874,6 +1874,9 @@ static int __block_commit_write(struct i
 	unsigned blocksize;
 	struct buffer_head *bh, *head;
 
+	if (from == to)
+		return 0;
+
 	blocksize = 1 << inode->i_blkbits;
 
 	for(bh = head = page_buffers(page), block_start = 0;
@@ -2335,17 +2338,6 @@ int nobh_prepare_write(struct page *page
 	if (is_mapped_to_disk)
 		SetPageMappedToDisk(page);
-	SetPageUptodate(page);
-
-	/*
-	 * Setting the page dirty here isn't necessary for the prepare_write
-	 * function - commit_write will do that.  But if/when this function is
-	 * used within the pagefault handler to ensure that all mmapped pages
-	 * have backing space in the filesystem, we will need to dirty the page
-	 * if its contents were altered.
-	 */
-	if (dirtied_it)
-		set_page_dirty(page);
 
 	return 0;
 
@@ -2354,17 +2346,6 @@ failed:
 		if (read_bh[i])
 			free_buffer_head(read_bh[i]);
 	}
-
-	/*
-	 * Error recovery is pretty slack.  Clear the page and mark it dirty
-	 * so we'll later zero out any blocks which _were_ allocated.
-	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr, 0, PAGE_CACHE_SIZE);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
-	SetPageUptodate(page);
-	set_page_dirty(page);
 	return ret;
 }
 EXPORT_SYMBOL(nobh_prepare_write);
@@ -2372,13 +2353,16 @@ EXPORT_SYMBOL(nobh_prepare_write);
 int nobh_commit_write(struct file *file, struct page *page,
 		unsigned from, unsigned to)
 {
-	struct inode *inode = page->mapping->host;
-	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	if (to > from) {
+		struct inode *inode = page->mapping->host;
+		loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
 
-	set_page_dirty(page);
-	if (pos > inode->i_size) {
-		i_size_write(inode, pos);
-		mark_inode_dirty(inode);
+		SetPageUptodate(page);
+		set_page_dirty(page);
+		if (pos > inode->i_size) {
+			i_size_write(inode, pos);
+			mark_inode_dirty(inode);
+		}
 	}
 	return 0;
 }
@@ -2469,6 +2453,7 @@ int nobh_truncate_page(struct address_sp
 		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
 		flush_dcache_page(page);
 		kunmap_atomic(kaddr, KM_USER0);
+		SetPageUptodate(page);
 		set_page_dirty(page);
 	}
 	unlock_page(page);
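
Illustration, not part of the patch: every fs/buffer.c hunk above serves one
rule.  A page may be marked uptodate and dirty only after data has really been
copied into it, and a zero-length commit (from == to) must be a no-op, so that
the write path can "commit" a failed atomic usercopy and simply retry.  The
commit side reduces to the sketch below; commit_write_sketch is a made-up name
and the body is a simplified copy of the nobh_commit_write() hunk:

	/* Sketch of the commit-side contract the hunks above implement. */
	static int commit_write_sketch(struct file *file, struct page *page,
			unsigned from, unsigned to)
	{
		if (to > from) {	/* some bytes were actually copied */
			struct inode *inode = page->mapping->host;
			loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

			SetPageUptodate(page);	/* contents are valid only now */
			set_page_dirty(page);
			if (pos > inode->i_size) {
				i_size_write(inode, pos);
				mark_inode_dirty(inode);
			}
		}
		return 0;		/* to == from: nothing copied, no-op */
	}
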
diff -puN fs/libfs.c~mm-pagecache-write-deadlocks fs/libfs.c
--- a/fs/libfs.c~mm-pagecache-write-deadlocks
+++ a/fs/libfs.c
@@ -327,32 +327,41 @@ int simple_readpage(struct file *file, s
 int simple_prepare_write(struct file *file, struct page *page,
 			unsigned from, unsigned to)
 {
-	if (!PageUptodate(page)) {
-		if (to - from != PAGE_CACHE_SIZE) {
-			void *kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr, 0, from);
-			memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
-		}
-		SetPageUptodate(page);
+	if (PageUptodate(page))
+		return 0;
+
+	ClearPageError(page);
+	if (to - from != PAGE_CACHE_SIZE) {
+		struct address_space *mapping = page->mapping;
+		int status = mapping->a_ops->readpage(file, page);
+		/* unlocked the page so must retry */
+		/* XXX: AOP_TRUNCATED_PAGE is a horrible name */
+		if (!status)
+			status = AOP_TRUNCATED_PAGE;
+		return status;
 	}
+	return 0;
 }
 
 int simple_commit_write(struct file *file, struct page *page,
-		unsigned offset, unsigned to)
+		unsigned from, unsigned to)
 {
-	struct inode *inode = page->mapping->host;
-	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-
-	/*
-	 * No need to use i_size_read() here, the i_size
-	 * cannot change under us because we hold the i_mutex.
-	 */
-	if (pos > inode->i_size)
-		i_size_write(inode, pos);
-	set_page_dirty(page);
+	if (to > from) {
+		struct inode *inode = page->mapping->host;
+		loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+		if (to - from == PAGE_CACHE_SIZE)
+			SetPageUptodate(page);
+		/*
+		 * No need to use i_size_read() here, the i_size
+		 * cannot change under us because we hold the i_mutex.
+		 */
+		if (pos > inode->i_size)
+			i_size_write(inode, pos);
+		/* XXX: why no mark_inode_dirty? */
+		set_page_dirty(page);
+	}
 	return 0;
 }
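
Illustration, not part of the patch: simple_prepare_write() now returns
AOP_TRUNCATED_PAGE after calling ->readpage(), because readpage unlocks the
page and the caller must therefore drop its reference and start over.  The
caller-side convention looks roughly like the sketch below; page and status
are locals, and mapping, index, file, a_ops, offset and bytes are assumed to
be set up as in generic_file_buffered_write():

	struct page *page;
	int status;

	for (;;) {
		page = find_or_create_page(mapping, index, GFP_KERNEL);
		if (!page)
			return -ENOMEM;
		status = a_ops->prepare_write(file, page, offset, offset + bytes);
		if (!status)
			break;			/* page is locked and prepared */
		if (status != AOP_TRUNCATED_PAGE)
			unlock_page(page);	/* real failure, page still locked */
		page_cache_release(page);
		if (status != AOP_TRUNCATED_PAGE)
			return status;
		/* AOP_TRUNCATED_PAGE: the aop dropped the page lock, go again */
	}
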
diff -puN mm/filemap.c~mm-pagecache-write-deadlocks mm/filemap.c
--- a/mm/filemap.c~mm-pagecache-write-deadlocks
+++ a/mm/filemap.c
@@ -2102,10 +2102,9 @@ generic_file_buffered_write(struct kiocb
 			bytes = min(cur_iov->iov_len - iov_offset, bytes);
 
 		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
+		 * Bring in the user page that we will copy from _first_, this
+		 * minimises the chance we have to break into the slowpath
+		 * below.
 		 */
 		fault_in_pages_readable(buf, bytes);
 
@@ -2117,8 +2116,6 @@ generic_file_buffered_write(struct kiocb
 
 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
 		if (unlikely(status)) {
-			loff_t isize = i_size_read(inode);
-
 			if (status != AOP_TRUNCATED_PAGE)
 				unlock_page(page);
 			page_cache_release(page);
@@ -2126,52 +2123,63 @@ generic_file_buffered_write(struct kiocb
 				continue;
 			/*
 			 * prepare_write() may have instantiated a few blocks
-			 * outside i_size.  Trim these off again.
+			 * outside i_size.  Trim these off again.  Don't need
+			 * i_size_read because we hold i_mutex.
 			 */
-			if (pos + bytes > isize)
-				vmtruncate(inode, isize);
+			if (pos + bytes > inode->i_size)
+				vmtruncate(inode, inode->i_size);
 			break;
 		}
+
+		/*
+		 * Must not enter the pagefault handler here, because we hold
+		 * the page lock, so we might recursively deadlock on the same
+		 * lock, or get an ABBA deadlock against a different lock, or
+		 * against the mmap_sem (which nests outside the page lock).
+		 * So increment preempt count, and use _atomic usercopies.
+		 */
+		inc_preempt_count();
 		if (likely(nr_segs == 1))
-			copied = filemap_copy_from_user(page, offset,
+			copied = filemap_copy_from_user_atomic(page, offset,
 							buf, bytes);
 		else
-			copied = filemap_copy_from_user_iovec(page, offset,
-						cur_iov, iov_offset, bytes);
+			copied = filemap_copy_from_user_iovec_atomic(page,
+						offset, cur_iov, iov_offset,
+						bytes);
+		dec_preempt_count();
+		if (unlikely(copied != bytes))
+			copied = 0;
+
 		flush_dcache_page(page);
-		status = a_ops->commit_write(file, page, offset, offset+bytes);
+		status = a_ops->commit_write(file, page, offset, offset+copied);
 		if (status == AOP_TRUNCATED_PAGE) {
 			page_cache_release(page);
 			continue;
 		}
-		if (likely(copied > 0)) {
-			if (!status)
-				status = copied;
-
-			if (status >= 0) {
-				written += status;
-				count -= status;
-				pos += status;
-				buf += status;
-				if (unlikely(nr_segs > 1)) {
-					filemap_set_next_iovec(&cur_iov,
							&iov_offset, status);
-					if (count)
-						buf = cur_iov->iov_base +
-							iov_offset;
-				} else {
-					iov_offset += status;
-				}
-			}
-		}
-		if (unlikely(copied != bytes))
-			if (status >= 0)
-				status = -EFAULT;
 		unlock_page(page);
 		mark_page_accessed(page);
 		page_cache_release(page);
+
 		if (status < 0)
 			break;
+		if (likely(copied == bytes)) {
+			written += copied;
+			count -= copied;
+			pos += copied;
+			buf += copied;
+			if (unlikely(nr_segs > 1)) {
+				filemap_set_next_iovec(&cur_iov,
+						&iov_offset, copied);
+				if (count)
+					buf = cur_iov->iov_base + iov_offset;
+			} else {
+				iov_offset += copied;
+			}
+#ifdef CONFIG_DEBUG_VM
+		} else {
+			fault_in_pages_readable(buf, bytes);
+#endif
+		}
 		balance_dirty_pages_ratelimited(mapping);
 		cond_resched();
 	} while (count);
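
Illustration, not part of the patch: stripped down to a single segment, with
error handling and the per-iteration recomputation of index/offset/bytes
elided, the rewritten copy loop above has the following shape
(__grab_cache_page() and the variables are as in mm/filemap.c):

	do {
		/* May fault and sleep: no locks are held here. */
		fault_in_pages_readable(buf, bytes);

		page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec);
		status = a_ops->prepare_write(file, page, offset, offset + bytes);

		inc_preempt_count();	/* the pagefault handler must not run */
		copied = filemap_copy_from_user_atomic(page, offset, buf, bytes);
		dec_preempt_count();
		if (copied != bytes)
			copied = 0;	/* short copy: commit zero bytes */

		flush_dcache_page(page);
		status = a_ops->commit_write(file, page, offset, offset + copied);
		unlock_page(page);
		page_cache_release(page);

		buf += copied;
		pos += copied;
		count -= copied;	/* copied == 0 makes this a plain retry */
	} while (count);

Forward progress relies on fault_in_pages_readable() making the next atomic
copy succeed; the CONFIG_DEBUG_VM branch above appears to exist so that this
zero-byte retry path actually gets exercised (see
mm-only-mm-debug-write-deadlocks.patch in the list below).
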
diff -puN mm/filemap.h~mm-pagecache-write-deadlocks mm/filemap.h
--- a/mm/filemap.h~mm-pagecache-write-deadlocks
+++ a/mm/filemap.h
@@ -22,19 +22,19 @@ __filemap_copy_from_user_iovec_inatomic(
 
 /*
  * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then clear the page
- * out to (offset+bytes) and return the number of bytes which were copied.
+ * were sucessfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
  *
- * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache
- * to *NOT* zero any tail of the buffer that it failed to copy.  If it does,
- * and if the following non-atomic copy succeeds, then there is a small window
- * where the target page contains neither the data before the write, nor the
- * data after the write (it contains zero).  A read at this time will see
- * data that is inconsistent with any ordering of the read and the write.
- * (This has been detected in practice).
+ * NOTE: For this to work reliably we really want
+ * copy_from_user_inatomic_nocache to *NOT* zero any tail of the buffer that it
+ * failed to copy.  If it does, and if the following non-atomic copy succeeds,
+ * then there is a small window where the target page contains neither the data
+ * before the write, nor the data after the write (it contains zero).  A read at
+ * this time will see data that is inconsistent with any ordering of the read
+ * and the write.  (This has been detected in practice).
  */
 static inline size_t
-filemap_copy_from_user(struct page *page, unsigned long offset,
+filemap_copy_from_user_atomic(struct page *page, unsigned long offset,
 			const char __user *buf, unsigned bytes)
 {
 	char *kaddr;
@@ -44,23 +44,32 @@ filemap_copy_from_user(struct page
 	left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	if (left != 0) {
-		/* Do it the slow way */
-		kaddr = kmap(page);
-		left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
-		kunmap(page);
-	}
 	return bytes - left;
 }
+
+static inline size_t
+filemap_copy_from_user_nonatomic(struct page *page, unsigned long offset,
+			const char __user *buf, unsigned bytes)
+{
+	char *kaddr;
+	int left;
+
+	kaddr = kmap(page);
+	left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
+	kunmap(page);
+	return bytes - left;
+}
 
 /*
- * This has the same sideeffects and return value as filemap_copy_from_user().
+ * This has the same sideeffects and return value as
+ * filemap_copy_from_user_atomic().
  * The difference is that on a fault we need to memset the remainder of the
  * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
  * single-segment behaviour.
  */
 static inline size_t
-filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
+filemap_copy_from_user_iovec_atomic(struct page *page, unsigned long offset,
 			const struct iovec *iov, size_t base, size_t bytes)
 {
 	char *kaddr;
@@ -70,14 +79,27 @@ filemap_copy_from_user_iovec(struct page
 	copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
 							 base, bytes);
 	kunmap_atomic(kaddr, KM_USER0);
-	if (copied != bytes) {
-		kaddr = kmap(page);
-		copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
-								 base, bytes);
-		if (bytes - copied)
-			memset(kaddr + offset + copied, 0, bytes - copied);
-		kunmap(page);
-	}
+	return copied;
+}
+
+/*
+ * This has the same sideeffects and return value as
+ * filemap_copy_from_user_nonatomic().
+ * The difference is that on a fault we need to memset the remainder of the
+ * page (out to offset+bytes), to emulate filemap_copy_from_user_nonatomic()'s
+ * single-segment behaviour.
+ */
+static inline size_t
+filemap_copy_from_user_iovec_nonatomic(struct page *page, unsigned long offset,
+			const struct iovec *iov, size_t base, size_t bytes)
+{
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap(page);
+	copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
+							 base, bytes);
+	kunmap(page);
 	return copied;
 }
_

Patches currently in -mm which might be from npiggin@xxxxxxx are

mm-comment-mmap_sem--lock_page-lockorder.patch
mm-only-mm-debug-write-deadlocks.patch
mm-fix-pagecache-write-deadlocks.patch
mm-pagecache-write-deadlocks.patch
oom-dont-kill-unkillable-children-or-siblings.patch
oom-cleanup-messages.patch
oom-less-memdie.patch
mm-incorrect-vm_fault_oom-returns-from-drivers.patch
mm-add-arch_alloc_page.patch
radix-tree-rcu-lockless-readside.patch
-
To unsubscribe from this list: send the line "unsubscribe mm-commits" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html