Re: hole-punch vs fault

On Wed, Nov 27, 2013 at 07:33:43PM -0700, Matthew Wilcox wrote:
> Yeah, I don't think the page lock is enough (maybe someone can convince
> me otherwise).  How does this look?  Checking the generation number
> after we get i_mutex ensures that the truncate has finished running.

I should probably confess that my original reason for looking into this
was that I noticed that ext2-xip has a race between truncate() and fault.
Because there's no struct page, there's no page lock, so there's nothing to
ensure that the mapping doesn't go away while we're inserting the PTE.

So I looked to see how this was solved for hole punches, and found out
that it wasn't.  So I tried to come up with a solution that would work
for both.  I did implement the per-address_space rwsem idea, but I don't
like it.
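
The core of it is just a generation count plus a retry loop: sample the
count before doing the lookup, re-check it under i_mutex before
establishing the PTE, and start over if a punch got in between.
Something like this stand-alone user-space sketch, with made-up names
rather than the kernel interfaces:

/*
 * User-space sketch of the damage-count idea.  punch_hole() stands in
 * for the truncate/hole-punch side freeing blocks; fault() stands in
 * for the page-fault side, which must not map blocks that have been
 * given back to the filesystem.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned damage_count;		/* bumped whenever blocks are freed */
static long block_mapping = 42;		/* stand-in for the block lookup result */

static unsigned read_damage(void)
{
	/* acquire pairs with the release below (like smp_rmb vs smp_wmb) */
	return __atomic_load_n(&damage_count, __ATOMIC_ACQUIRE);
}

static void punch_hole(void)
{
	pthread_mutex_lock(&i_mutex);
	block_mapping = -1;			/* free the blocks */
	__atomic_fetch_add(&damage_count, 1, __ATOMIC_RELEASE);
	pthread_mutex_unlock(&i_mutex);
}

static long fault(void)
{
	unsigned seq;
	long blk;

again:
	seq = read_damage();
	blk = block_mapping;		/* the lockless lookup */

	pthread_mutex_lock(&i_mutex);
	if (read_damage() != seq) {	/* a punch ran since we looked; retry */
		pthread_mutex_unlock(&i_mutex);
		goto again;
	}
	/* no punch can run while we hold i_mutex, so "insert the PTE" here */
	pthread_mutex_unlock(&i_mutex);
	return blk;
}

int main(void)
{
	printf("faulted block %ld, damage %u\n", fault(), read_damage());
	punch_hole();
	printf("faulted block %ld, damage %u\n", fault(), read_damage());
	return 0;
}

The patch is the same shape: i_damaged is the counter, i_mutex is the
lock the punch/truncate side already holds, and smp_rmb()/smp_wmb()
provide the ordering that acquire/release does above.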

Here's a version that includes the changes to ext2 and filemap_xip:

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 8a33764..33c1d59 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1214,6 +1214,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 
 	truncate_setsize(inode, newsize);
 	__ext2_truncate_blocks(inode, newsize);
+	if (mapping_is_xip(inode->i_mapping))
+		damage_mapping(inode->i_mapping);
 
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	if (inode_needs_sync(inode)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0757634..6f90da6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3622,6 +3622,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 					    stop_block);
 
 	ext4_discard_preallocations(inode);
+	damage_mapping(mapping);
 	up_write(&EXT4_I(inode)->i_data_sem);
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 121f11f..1db6032 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -411,6 +411,7 @@ struct address_space {
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	spinlock_t		tree_lock;	/* and lock protecting it */
 	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
+	unsigned		i_damaged;	/* damage count */
 	struct rb_root		i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
 	struct mutex		i_mmap_mutex;	/* protect tree, count, list */
@@ -499,6 +500,31 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
 }
 
 /*
+ * A mapping is damaged when blocks are removed from the filesystem's
+ * data structures.
+ */
+static inline unsigned mapping_damage(struct address_space *mapping)
+{
+	unsigned seq = ACCESS_ONCE(mapping->i_damaged);
+	smp_rmb();	/* Subsequent reads of damageable data structures */
+	return seq;
+}
+
+/* Must be called with i_mutex held */
+static inline bool
+mapping_is_damaged(struct address_space *mapping, unsigned seq)
+{
+	return mapping->i_damaged != seq;
+}
+
+/* Must be called with i_mutex held */
+static inline void damage_mapping(struct address_space *mapping)
+{
+	smp_wmb();	/* Prior writes to damageable data structures */
+	mapping->i_damaged++;
+}
+
+/*
  * Use sequence counter to get consistent i_size on 32-bit processors.
  */
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a9..71c936d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1617,6 +1617,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	pgoff_t offset = vmf->pgoff;
 	struct page *page;
 	pgoff_t size;
+	unsigned damage = mapping_damage(mapping);
 	int ret = 0;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -1676,6 +1677,19 @@ retry_find:
 		return VM_FAULT_SIGBUS;
 	}
 
+	/*
+	 * Check if we were the unlucky victim of a hole punch
+	 */
+	mutex_lock(&inode->i_mutex);
+	if (unlikely(mapping_is_damaged(mapping, damage))) {
+		mutex_unlock(&inode->i_mutex);
+		unlock_page(page);
+		page_cache_release(page);
+		damage = mapping_damage(mapping);
+		goto retry_find;
+	}
+	mutex_unlock(&inode->i_mutex);
+
 	vmf->page = page;
 	return ret | VM_FAULT_LOCKED;
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3..c8bf231 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -224,6 +224,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
+	unsigned damage;
 	pgoff_t size;
 	void *xip_mem;
 	unsigned long xip_pfn;
@@ -232,6 +233,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	/* XXX: are VM_FAULT_ codes OK? */
 again:
+	damage = mapping_damage(mapping);
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
 		return VM_FAULT_SIGBUS;
@@ -260,8 +262,14 @@ again:
 		__xip_unmap(mapping, vmf->pgoff);
 
 found:
+		mutex_lock(&inode->i_mutex);
+		if (unlikely(mapping_is_damaged(mapping, damage))) {
+			mutex_unlock(&inode->i_mutex);
+			goto again;
+		}
 		err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
 							xip_pfn);
+		mutex_unlock(&inode->i_mutex);
 		if (err == -ENOMEM)
 			return VM_FAULT_OOM;
 		/*

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."