The patch titled
     mm: pagecache write deadlocks
has been removed from the -mm tree.  Its filename is
     mm-pagecache-write-deadlocks.patch

This patch was dropped because it is obsolete

------------------------------------------------------
Subject: mm: pagecache write deadlocks
From: Nick Piggin <npiggin@xxxxxxx>

Signed-off-by: Andrew Morton <akpm@xxxxxxxx>
---

 fs/buffer.c  |   41 +++++++------------------
 fs/libfs.c   |   49 +++++++++++++++++-------------
 mm/filemap.c |   80 ++++++++++++++++++++++++++-----------------------
 mm/filemap.h |   74 +++++++++++++++++++++++++++++----------------
 4 files changed, 134 insertions(+), 110 deletions(-)

diff -puN fs/buffer.c~mm-pagecache-write-deadlocks fs/buffer.c
--- a/fs/buffer.c~mm-pagecache-write-deadlocks
+++ a/fs/buffer.c
@@ -1874,6 +1874,9 @@ static int __block_commit_write(struct i
 	unsigned blocksize;
 	struct buffer_head *bh, *head;
 
+	if (from == to)
+		return 0;
+
 	blocksize = 1 << inode->i_blkbits;
 
 	for(bh = head = page_buffers(page), block_start = 0;
@@ -2335,17 +2338,6 @@ int nobh_prepare_write(struct page *page
 	if (is_mapped_to_disk)
 		SetPageMappedToDisk(page);
-	SetPageUptodate(page);
-
-	/*
-	 * Setting the page dirty here isn't necessary for the prepare_write
-	 * function - commit_write will do that.  But if/when this function is
-	 * used within the pagefault handler to ensure that all mmapped pages
-	 * have backing space in the filesystem, we will need to dirty the page
-	 * if its contents were altered.
-	 */
-	if (dirtied_it)
-		set_page_dirty(page);
 
 	return 0;
 
@@ -2354,17 +2346,6 @@ failed:
 		if (read_bh[i])
 			free_buffer_head(read_bh[i]);
 	}
-
-	/*
-	 * Error recovery is pretty slack.  Clear the page and mark it dirty
-	 * so we'll later zero out any blocks which _were_ allocated.
-	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr, 0, PAGE_CACHE_SIZE);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
-	SetPageUptodate(page);
-	set_page_dirty(page);
 	return ret;
 }
 EXPORT_SYMBOL(nobh_prepare_write);
@@ -2372,13 +2353,16 @@ EXPORT_SYMBOL(nobh_prepare_write);
 int nobh_commit_write(struct file *file, struct page *page,
 		unsigned from, unsigned to)
 {
-	struct inode *inode = page->mapping->host;
-	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	if (to > from) {
+		struct inode *inode = page->mapping->host;
+		loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
 
-	set_page_dirty(page);
-	if (pos > inode->i_size) {
-		i_size_write(inode, pos);
-		mark_inode_dirty(inode);
+		SetPageUptodate(page);
+		set_page_dirty(page);
+		if (pos > inode->i_size) {
+			i_size_write(inode, pos);
+			mark_inode_dirty(inode);
+		}
 	}
 	return 0;
 }
@@ -2469,6 +2453,7 @@ int nobh_truncate_page(struct address_sp
 		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
 		flush_dcache_page(page);
 		kunmap_atomic(kaddr, KM_USER0);
+		SetPageUptodate(page);
 		set_page_dirty(page);
 	}
 	unlock_page(page);
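
Illustration, not part of the patch: every fs/buffer.c hunk above serves one
rule.  A page may be marked uptodate and dirty only after data has really been
copied into it, and a zero-length commit (from == to) must be a no-op, so that
the write path can "commit" a failed atomic usercopy and simply retry.  The
commit side reduces to the sketch below; commit_write_sketch is a made-up name
and the body is a simplified copy of the nobh_commit_write() hunk:

	/* Sketch of the commit-side contract the hunks above implement. */
	static int commit_write_sketch(struct file *file, struct page *page,
			unsigned from, unsigned to)
	{
		if (to > from) {	/* some bytes were actually copied */
			struct inode *inode = page->mapping->host;
			loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

			SetPageUptodate(page);	/* contents are valid only now */
			set_page_dirty(page);
			if (pos > inode->i_size) {
				i_size_write(inode, pos);
				mark_inode_dirty(inode);
			}
		}
		return 0;		/* to == from: nothing copied, no-op */
	}
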
diff -puN fs/libfs.c~mm-pagecache-write-deadlocks fs/libfs.c
--- a/fs/libfs.c~mm-pagecache-write-deadlocks
+++ a/fs/libfs.c
@@ -327,32 +327,41 @@ int simple_readpage(struct file *file, s
 int simple_prepare_write(struct file *file, struct page *page,
 			unsigned from, unsigned to)
 {
-	if (!PageUptodate(page)) {
-		if (to - from != PAGE_CACHE_SIZE) {
-			void *kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr, 0, from);
-			memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
-		}
-		SetPageUptodate(page);
+	if (PageUptodate(page))
+		return 0;
+
+	ClearPageError(page);
+	if (to - from != PAGE_CACHE_SIZE) {
+		struct address_space *mapping = page->mapping;
+		int status = mapping->a_ops->readpage(file, page);
+		/* unlocked the page so must retry */
+		/* XXX: AOP_TRUNCATED_PAGE is a horrible name */
+		if (!status)
+			status = AOP_TRUNCATED_PAGE;
+		return status;
 	}
+	return 0;
 }
 
 int simple_commit_write(struct file *file, struct page *page,
-		unsigned offset, unsigned to)
+		unsigned from, unsigned to)
 {
-	struct inode *inode = page->mapping->host;
-	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-
-	/*
-	 * No need to use i_size_read() here, the i_size
-	 * cannot change under us because we hold the i_mutex.
-	 */
-	if (pos > inode->i_size)
-		i_size_write(inode, pos);
-	set_page_dirty(page);
+	if (to > from) {
+		struct inode *inode = page->mapping->host;
+		loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+		if (to - from == PAGE_CACHE_SIZE)
+			SetPageUptodate(page);
+		/*
+		 * No need to use i_size_read() here, the i_size
+		 * cannot change under us because we hold the i_mutex.
+		 */
+		if (pos > inode->i_size)
+			i_size_write(inode, pos);
+		/* XXX: why no mark_inode_dirty? */
+		set_page_dirty(page);
+	}
 	return 0;
 }
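
Illustration, not part of the patch: simple_prepare_write() now returns
AOP_TRUNCATED_PAGE after calling ->readpage(), because readpage unlocks the
page and the caller must therefore drop its reference and start over.  The
caller-side convention looks roughly like the sketch below; page and status
are locals, and mapping, index, file, a_ops, offset and bytes are assumed to
be set up as in generic_file_buffered_write():

	struct page *page;
	int status;

	for (;;) {
		page = find_or_create_page(mapping, index, GFP_KERNEL);
		if (!page)
			return -ENOMEM;
		status = a_ops->prepare_write(file, page, offset, offset + bytes);
		if (!status)
			break;			/* page is locked and prepared */
		if (status != AOP_TRUNCATED_PAGE)
			unlock_page(page);	/* real failure, page still locked */
		page_cache_release(page);
		if (status != AOP_TRUNCATED_PAGE)
			return status;
		/* AOP_TRUNCATED_PAGE: the aop dropped the page lock, go again */
	}
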
diff -puN mm/filemap.c~mm-pagecache-write-deadlocks mm/filemap.c
--- a/mm/filemap.c~mm-pagecache-write-deadlocks
+++ a/mm/filemap.c
@@ -2102,10 +2102,9 @@ generic_file_buffered_write(struct kiocb
 			bytes = min(cur_iov->iov_len - iov_offset, bytes);
 
 		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
+		 * Bring in the user page that we will copy from _first_, this
+		 * minimises the chance we have to break into the slowpath
+		 * below.
 		 */
 		fault_in_pages_readable(buf, bytes);
 
@@ -2117,8 +2116,6 @@ generic_file_buffered_write(struct kiocb
 
 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
 		if (unlikely(status)) {
-			loff_t isize = i_size_read(inode);
-
 			if (status != AOP_TRUNCATED_PAGE)
 				unlock_page(page);
 			page_cache_release(page);
@@ -2126,52 +2123,63 @@ generic_file_buffered_write(struct kiocb
 				continue;
 			/*
 			 * prepare_write() may have instantiated a few blocks
-			 * outside i_size.  Trim these off again.
+			 * outside i_size.  Trim these off again.  Don't need
+			 * i_size_read because we hold i_mutex.
 			 */
-			if (pos + bytes > isize)
-				vmtruncate(inode, isize);
+			if (pos + bytes > inode->i_size)
+				vmtruncate(inode, inode->i_size);
 			break;
 		}
+
+		/*
+		 * Must not enter the pagefault handler here, because we hold
+		 * the page lock, so we might recursively deadlock on the same
+		 * lock, or get an ABBA deadlock against a different lock, or
+		 * against the mmap_sem (which nests outside the page lock).
+		 * So increment preempt count, and use _atomic usercopies.
+		 */
+		inc_preempt_count();
 		if (likely(nr_segs == 1))
-			copied = filemap_copy_from_user(page, offset,
+			copied = filemap_copy_from_user_atomic(page, offset,
 							buf, bytes);
 		else
-			copied = filemap_copy_from_user_iovec(page, offset,
-						cur_iov, iov_offset, bytes);
+			copied = filemap_copy_from_user_iovec_atomic(page,
+						offset, cur_iov, iov_offset,
+						bytes);
+		dec_preempt_count();
+		if (unlikely(copied != bytes))
+			copied = 0;
+
 		flush_dcache_page(page);
-		status = a_ops->commit_write(file, page, offset, offset+bytes);
+		status = a_ops->commit_write(file, page, offset, offset+copied);
 		if (status == AOP_TRUNCATED_PAGE) {
 			page_cache_release(page);
 			continue;
 		}
-		if (likely(copied > 0)) {
-			if (!status)
-				status = copied;
-
-			if (status >= 0) {
-				written += status;
-				count -= status;
-				pos += status;
-				buf += status;
-				if (unlikely(nr_segs > 1)) {
-					filemap_set_next_iovec(&cur_iov,
							&iov_offset, status);
-					if (count)
-						buf = cur_iov->iov_base +
-							iov_offset;
-				} else {
-					iov_offset += status;
-				}
-			}
-		}
-		if (unlikely(copied != bytes))
-			if (status >= 0)
-				status = -EFAULT;
 		unlock_page(page);
 		mark_page_accessed(page);
 		page_cache_release(page);
+
 		if (status < 0)
 			break;
+		if (likely(copied == bytes)) {
+			written += copied;
+			count -= copied;
+			pos += copied;
+			buf += copied;
+			if (unlikely(nr_segs > 1)) {
+				filemap_set_next_iovec(&cur_iov,
+						&iov_offset, copied);
+				if (count)
+					buf = cur_iov->iov_base + iov_offset;
+			} else {
+				iov_offset += copied;
+			}
+#ifdef CONFIG_DEBUG_VM
+		} else {
+			fault_in_pages_readable(buf, bytes);
+#endif
+		}
 		balance_dirty_pages_ratelimited(mapping);
 		cond_resched();
 	} while (count);
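
Illustration, not part of the patch: stripped down to a single segment, with
error handling and the per-iteration recomputation of index/offset/bytes
elided, the rewritten copy loop above has the following shape
(__grab_cache_page() and the variables are as in mm/filemap.c):

	do {
		/* May fault and sleep: no locks are held here. */
		fault_in_pages_readable(buf, bytes);

		page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec);
		status = a_ops->prepare_write(file, page, offset, offset + bytes);

		inc_preempt_count();	/* the pagefault handler must not run */
		copied = filemap_copy_from_user_atomic(page, offset, buf, bytes);
		dec_preempt_count();
		if (copied != bytes)
			copied = 0;	/* short copy: commit zero bytes */

		flush_dcache_page(page);
		status = a_ops->commit_write(file, page, offset, offset + copied);
		unlock_page(page);
		page_cache_release(page);

		buf += copied;
		pos += copied;
		count -= copied;	/* copied == 0 makes this a plain retry */
	} while (count);

Forward progress relies on fault_in_pages_readable() making the next atomic
copy succeed; the CONFIG_DEBUG_VM branch above appears to exist so that this
zero-byte retry path actually gets exercised (see
mm-only-mm-debug-write-deadlocks.patch in the list below).
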
diff -puN mm/filemap.h~mm-pagecache-write-deadlocks mm/filemap.h
--- a/mm/filemap.h~mm-pagecache-write-deadlocks
+++ a/mm/filemap.h
@@ -22,19 +22,19 @@ __filemap_copy_from_user_iovec_inatomic(
 
 /*
  * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then clear the page
- * out to (offset+bytes) and return the number of bytes which were copied.
+ * were sucessfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
  *
- * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache
- * to *NOT* zero any tail of the buffer that it failed to copy.  If it does,
- * and if the following non-atomic copy succeeds, then there is a small window
- * where the target page contains neither the data before the write, nor the
- * data after the write (it contains zero).  A read at this time will see
- * data that is inconsistent with any ordering of the read and the write.
- * (This has been detected in practice).
+ * NOTE: For this to work reliably we really want
+ * copy_from_user_inatomic_nocache to *NOT* zero any tail of the buffer that it
+ * failed to copy.  If it does, and if the following non-atomic copy succeeds,
+ * then there is a small window where the target page contains neither the data
+ * before the write, nor the data after the write (it contains zero).  A read at
+ * this time will see data that is inconsistent with any ordering of the read
+ * and the write.  (This has been detected in practice).
  */
 static inline size_t
-filemap_copy_from_user(struct page *page, unsigned long offset,
+filemap_copy_from_user_atomic(struct page *page, unsigned long offset,
 			const char __user *buf, unsigned bytes)
 {
 	char *kaddr;
@@ -44,23 +44,32 @@ filemap_copy_from_user(struct page
 	left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	if (left != 0) {
-		/* Do it the slow way */
-		kaddr = kmap(page);
-		left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
-		kunmap(page);
-	}
 	return bytes - left;
 }
+
+static inline size_t
+filemap_copy_from_user_nonatomic(struct page *page, unsigned long offset,
+			const char __user *buf, unsigned bytes)
+{
+	char *kaddr;
+	int left;
+
+	kaddr = kmap(page);
+	left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
+	kunmap(page);
+	return bytes - left;
+}
 
 /*
- * This has the same sideeffects and return value as filemap_copy_from_user().
+ * This has the same sideeffects and return value as
+ * filemap_copy_from_user_atomic().
  * The difference is that on a fault we need to memset the remainder of the
  * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
  * single-segment behaviour.
  */
 static inline size_t
-filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
+filemap_copy_from_user_iovec_atomic(struct page *page, unsigned long offset,
 			const struct iovec *iov, size_t base, size_t bytes)
 {
 	char *kaddr;
@@ -70,14 +79,27 @@ filemap_copy_from_user_iovec(struct page
 	copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
 							 base, bytes);
 	kunmap_atomic(kaddr, KM_USER0);
-	if (copied != bytes) {
-		kaddr = kmap(page);
-		copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
-								 base, bytes);
-		if (bytes - copied)
-			memset(kaddr + offset + copied, 0, bytes - copied);
-		kunmap(page);
-	}
+	return copied;
+}
+
+/*
+ * This has the same sideeffects and return value as
+ * filemap_copy_from_user_nonatomic().
+ * The difference is that on a fault we need to memset the remainder of the
+ * page (out to offset+bytes), to emulate filemap_copy_from_user_nonatomic()'s
+ * single-segment behaviour.
+ */
+static inline size_t
+filemap_copy_from_user_iovec_nonatomic(struct page *page, unsigned long offset,
+			const struct iovec *iov, size_t base, size_t bytes)
+{
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap(page);
+	copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
+							 base, bytes);
+	kunmap(page);
 	return copied;
 }
_

Patches currently in -mm which might be from npiggin@xxxxxxx are

mm-comment-mmap_sem--lock_page-lockorder.patch
mm-only-mm-debug-write-deadlocks.patch
mm-fix-pagecache-write-deadlocks.patch
mm-pagecache-write-deadlocks.patch
oom-dont-kill-unkillable-children-or-siblings.patch
oom-cleanup-messages.patch
oom-less-memdie.patch
mm-incorrect-vm_fault_oom-returns-from-drivers.patch
mm-add-arch_alloc_page.patch
radix-tree-rcu-lockless-readside.patch
-
To unsubscribe from this list: send the line "unsubscribe mm-commits" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html