Re: hole-punch vs fault

Matthew Wilcox <matthew@xxxxxx> · Thu, 28 Nov 2013 05:24:34 -0700

On Wed, Nov 27, 2013 at 09:44:54PM -0700, Matthew Wilcox wrote:
> Ohh.  We can't take i_mutex because that's an inversion with mmap_sem.

Or the simple option is just to use a different mutex.  That makes life
a bit more interesting because the filesystem now has to take the mutex
at the appropriate point to ensure that the pages found by fault won't
be damagable.  I think this is correct for ext2/4:

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 8a33764..25ce796 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1212,8 +1212,14 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 	if (error)
 		return error;
 
+	if (mapping_is_xip(inode->i_mapping))
+		damage_lock(inode->i_mapping);
 	truncate_setsize(inode, newsize);
 	__ext2_truncate_blocks(inode, newsize);
+	if (mapping_is_xip(inode->i_mapping)) {
+		damage_mapping(inode->i_mapping);
+		damage_unlock(inode->i_mapping);
+	}
 
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	if (inode_needs_sync(inode)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0757634..4d3d533 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3571,6 +3571,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	first_block_offset = round_up(offset, sb->s_blocksize);
 	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
 
+	damage_lock(mapping);
 	/* Now release the pages and zero block aligned part of pages*/
 	if (last_block_offset > first_block_offset)
 		truncate_pagecache_range(inode, first_block_offset,
@@ -3610,6 +3611,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	ret = ext4_es_remove_extent(inode, first_block,
 				    stop_block - first_block);
 	if (ret) {
+		damage_unlock(mapping);
 		up_write(&EXT4_I(inode)->i_data_sem);
 		goto out_stop;
 	}
@@ -3622,6 +3624,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 					    stop_block);
 
 	ext4_discard_preallocations(inode);
+	damage_mapping(mapping);
+	damage_unlock(mapping);
 	up_write(&EXT4_I(inode)->i_data_sem);
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 121f11f..1ed4fa7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -411,6 +411,8 @@ struct address_space {
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	spinlock_t		tree_lock;	/* and lock protecting it */
 	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
+	struct mutex		i_damage_lock;	/* fs data structures */
+	unsigned		i_damaged;	/* damage count */
 	struct rb_root		i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
 	struct mutex		i_mmap_mutex;	/* protect tree, count, list */
@@ -499,6 +501,34 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
 }
 
 /*
+ * A mapping is damaged when blocks are removed from the filesystem's
+ * data structures.
+ */
+static inline unsigned mapping_damage(struct address_space *mapping)
+{
+	unsigned seq = ACCESS_ONCE(mapping->i_damaged);
+	smp_rmb();	/* Subsequent reads of damagable data structures */
+	return seq;
+}
+
+#define damage_lock(mapping)	mutex_lock(&mapping->i_damage_lock)
+#define damage_unlock(mapping)	mutex_unlock(&mapping->i_damage_lock)
+
+/* Must be called with i_damage_lock held */
+static inline bool
+mapping_is_damaged(struct address_space *mapping, unsigned seq)
+{
+	return mapping->i_damaged != seq;
+}
+
+/* Must be called with i_damage_lock held */
+static inline void damage_mapping(struct address_space *mapping)
+{
+	smp_wmb();	/* Prior writes to damagable data structures */
+	mapping->i_damaged++;
+}
+
+/*
  * Use sequence counter to get consistent i_size on 32-bit processors.
  */
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a9..f99e831 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1617,6 +1617,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	pgoff_t offset = vmf->pgoff;
 	struct page *page;
 	pgoff_t size;
+	unsigned damage = mapping_damage(mapping);
 	int ret = 0;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -1676,6 +1677,19 @@ retry_find:
 		return VM_FAULT_SIGBUS;
 	}
 
+	/*
+	 * Check if we were the unlucky victim of a holepunch
+	 */
+	damage_lock(mapping);
+	if (unlikely(mapping_is_damaged(mapping, damage))) {
+		damage_unlock(mapping);
+		unlock_page(page);
+		page_cache_release(page);
+		damage = mapping_damage(mapping);
+		goto retry_find;
+	}
+	damage_unlock(mapping);
+
 	vmf->page = page;
 	return ret | VM_FAULT_LOCKED;
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3..dc478eb 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -224,6 +224,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
+	unsigned damage;
 	pgoff_t size;
 	void *xip_mem;
 	unsigned long xip_pfn;
@@ -232,6 +233,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	/* XXX: are VM_FAULT_ codes OK? */
 again:
+	damage = mapping_damage(mapping);
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
 		return VM_FAULT_SIGBUS;
@@ -260,8 +262,14 @@ again:
 		__xip_unmap(mapping, vmf->pgoff);
 
 found:
+		damage_lock(mapping);
+		if (unlikely(mapping_is_damaged(mapping, damage))) {
+			mutex_unlock(&inode->i_mutex);
+			goto again;
+		}
 		err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
 							xip_pfn);
+		damage_unlock(mapping);
 		if (err == -ENOMEM)
 			return VM_FAULT_OOM;
 		/*

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html