On Wed, Nov 27, 2013 at 09:44:54PM -0700, Matthew Wilcox wrote:
> Ohh.  We can't take i_mutex because that's an inversion with mmap_sem.

Or the simple option is just to use a different mutex.  That makes life
a bit more interesting because the filesystem now has to take the mutex
at the appropriate point to ensure that the pages found by fault won't
be damagable.  I think this is correct for ext2/4:

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 8a33764..25ce796 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1212,8 +1212,14 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 	if (error)
 		return error;
 
+	if (mapping_is_xip(inode->i_mapping))
+		damage_lock(inode->i_mapping);
 	truncate_setsize(inode, newsize);
 	__ext2_truncate_blocks(inode, newsize);
+	if (mapping_is_xip(inode->i_mapping)) {
+		damage_mapping(inode->i_mapping);
+		damage_unlock(inode->i_mapping);
+	}
 
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	if (inode_needs_sync(inode)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0757634..4d3d533 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3571,6 +3571,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	first_block_offset = round_up(offset, sb->s_blocksize);
 	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
 
+	damage_lock(mapping);
 	/* Now release the pages and zero block aligned part of pages*/
 	if (last_block_offset > first_block_offset)
 		truncate_pagecache_range(inode, first_block_offset,
@@ -3610,6 +3611,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	ret = ext4_es_remove_extent(inode, first_block,
 				    stop_block - first_block);
 	if (ret) {
+		damage_unlock(mapping);
 		up_write(&EXT4_I(inode)->i_data_sem);
 		goto out_stop;
 	}
@@ -3622,6 +3624,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 					 stop_block);
 
 	ext4_discard_preallocations(inode);
+	damage_mapping(mapping);
+	damage_unlock(mapping);
 	up_write(&EXT4_I(inode)->i_data_sem);
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 121f11f..1ed4fa7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -411,6 +411,8 @@ struct address_space {
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	spinlock_t		tree_lock;	/* and lock protecting it */
 	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
+	struct mutex		i_damage_lock;	/* fs data structures */
+	unsigned		i_damaged;	/* damage count */
 	struct rb_root		i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
 	struct mutex		i_mmap_mutex;	/* protect tree, count, list */
@@ -499,6 +501,34 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
 }
 
 /*
+ * A mapping is damaged when blocks are removed from the filesystem's
+ * data structures.
+ */
+static inline unsigned mapping_damage(struct address_space *mapping)
+{
+	unsigned seq = ACCESS_ONCE(mapping->i_damaged);
+	smp_rmb();	/* Subsequent reads of damagable data structures */
+	return seq;
+}
+
+#define damage_lock(mapping)	mutex_lock(&mapping->i_damage_lock)
+#define damage_unlock(mapping)	mutex_unlock(&mapping->i_damage_lock)
+
+/* Must be called with i_damage_lock held */
+static inline bool
+mapping_is_damaged(struct address_space *mapping, unsigned seq)
+{
+	return mapping->i_damaged != seq;
+}
+
+/* Must be called with i_damage_lock held */
+static inline void damage_mapping(struct address_space *mapping)
+{
+	smp_wmb();	/* Prior writes to damagable data structures */
+	mapping->i_damaged++;
+}
+
+/*
  * Use sequence counter to get consistent i_size on 32-bit processors.
  */
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a9..f99e831 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1617,6 +1617,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	pgoff_t offset = vmf->pgoff;
 	struct page *page;
 	pgoff_t size;
+	unsigned damage = mapping_damage(mapping);
 	int ret = 0;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -1676,6 +1677,19 @@ retry_find:
 		return VM_FAULT_SIGBUS;
 	}
 
+	/*
+	 * Check if we were the unlucky victim of a holepunch
+	 */
+	damage_lock(mapping);
+	if (unlikely(mapping_is_damaged(mapping, damage))) {
+		damage_unlock(mapping);
+		unlock_page(page);
+		page_cache_release(page);
+		damage = mapping_damage(mapping);
+		goto retry_find;
+	}
+	damage_unlock(mapping);
+
 	vmf->page = page;
 	return ret | VM_FAULT_LOCKED;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3..dc478eb 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -224,6 +224,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
+	unsigned damage;
 	pgoff_t size;
 	void *xip_mem;
 	unsigned long xip_pfn;
@@ -232,6 +233,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	/* XXX: are VM_FAULT_ codes OK? */
 again:
+	damage = mapping_damage(mapping);
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
 		return VM_FAULT_SIGBUS;
@@ -260,8 +262,14 @@ again:
 		__xip_unmap(mapping, vmf->pgoff);
 
 found:
+	damage_lock(mapping);
+	if (unlikely(mapping_is_damaged(mapping, damage))) {
+		mutex_unlock(&inode->i_mutex);
+		goto again;
+	}
 	err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
 							xip_pfn);
+	damage_unlock(mapping);
 	if (err == -ENOMEM)
 		return VM_FAULT_OOM;
 	/*

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."
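The core of the patch is a mutex-protected damage count that behaves like a
sequence counter: a fault samples the count before its unlocked lookup and
re-validates it under i_damage_lock just before installing the translation,
while truncate/hole punch bumps the count under the same lock after removing
blocks.  Below is a minimal user-space sketch of that protocol, offered only
as an illustration: a pthread mutex stands in for i_damage_lock (so the
smp_rmb()/smp_wmb() barriers are subsumed by the lock), and the fault and
hole-punch bodies are hypothetical stand-ins, not kernel code.

	/*
	 * Minimal user-space sketch of the damage-count protocol from the
	 * patch above.  Names mirror the patch; everything else here is a
	 * simplified stand-in for illustration only.
	 */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct address_space {
		pthread_mutex_t i_damage_lock;	/* stands in for mapping->i_damage_lock */
		unsigned i_damaged;		/* damage count, bumped on each hole punch */
	};

	/* Sample the damage count before doing any unlocked lookup work. */
	static unsigned mapping_damage(struct address_space *mapping)
	{
		return mapping->i_damaged;
	}

	/* Must be called with i_damage_lock held. */
	static bool mapping_is_damaged(struct address_space *mapping, unsigned seq)
	{
		return mapping->i_damaged != seq;
	}

	/* Writer side: called with i_damage_lock held, after removing blocks. */
	static void damage_mapping(struct address_space *mapping)
	{
		mapping->i_damaged++;
	}

	/*
	 * Reader side (models filemap_fault/xip_file_fault): look up the block
	 * without the lock, then re-validate under the lock before "inserting"
	 * it.  If a hole punch raced with us, retry from the top.
	 */
	static void fault_one(struct address_space *mapping)
	{
		unsigned damage;

	retry:
		damage = mapping_damage(mapping);

		/* ... unlocked lookup of the block backing this fault ... */

		pthread_mutex_lock(&mapping->i_damage_lock);
		if (mapping_is_damaged(mapping, damage)) {
			pthread_mutex_unlock(&mapping->i_damage_lock);
			printf("lost a race with hole punch, retrying\n");
			goto retry;
		}
		/* ... install the (still valid) block into the page tables ... */
		pthread_mutex_unlock(&mapping->i_damage_lock);
	}

	/* Writer side (models ext4_punch_hole): remove blocks, bump the count. */
	static void punch_hole(struct address_space *mapping)
	{
		pthread_mutex_lock(&mapping->i_damage_lock);
		/* ... unmap pages and free blocks ... */
		damage_mapping(mapping);
		pthread_mutex_unlock(&mapping->i_damage_lock);
	}

	int main(void)
	{
		struct address_space mapping = {
			.i_damage_lock = PTHREAD_MUTEX_INITIALIZER,
			.i_damaged = 0,
		};

		fault_one(&mapping);	/* no damage: insert succeeds first time */
		punch_hole(&mapping);	/* bump the damage count */
		fault_one(&mapping);	/* succeeds; a concurrent punch would force a retry */
		return 0;
	}

Compared with taking i_mutex in the fault path (which inverts with mmap_sem),
this keeps the fault handler unlocked until the final re-validation, at the
cost of an occasional retry when a hole punch races with the fault.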