From: Matthew Wilcox <willy@xxxxxxxxxxxxxxx>

When we handle a write-fault on a DAX mapping, we currently insert a
read-only mapping and then take the page fault again to convert it to
a writable mapping.  This is necessary for the case where we cover a
hole with a read-only zero page, but when we have a data block already
allocated, it is inefficient.

Use the recently added vmf_insert_pfn_prot() to insert a writable
mapping, even though the default VM flags say to use a read-only
mapping.

Signed-off-by: Matthew Wilcox <willy@xxxxxxxxxxxxxxx>
---
 fs/dax.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 53 insertions(+), 20 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 206650f..3f6138d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -519,9 +519,44 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
+/*
+ * The default page protections for DAX VMAs are set to "copy" so that
+ * we get notifications when zero pages are written to.  This function
+ * is called when we're inserting a mapping to a data page.  If this is
+ * a write fault, we've already done all the necessary accounting and
+ * it's pointless to insert this translation entry read-only.  Convert
+ * the pgprot to be writable.
+ *
+ * While this is not the most elegant code, the compiler can see that (on
+ * any sane architecture) all four arms of the conditional are the same.
+ */
+static pgprot_t dax_pgprot(struct vm_area_struct *vma, bool write)
+{
+	pgprot_t pgprot = vma->vm_page_prot;
+	if (!write)
+		return pgprot;
+	if ((vma->vm_flags & (VM_READ|VM_EXEC)) == (VM_READ|VM_EXEC))
+		return __pgprot(pgprot_val(pgprot) ^
+				pgprot_val(__P111) ^
+				pgprot_val(__S111));
+	else if ((vma->vm_flags & (VM_READ|VM_EXEC)) == VM_READ)
+		return __pgprot(pgprot_val(pgprot) ^
+				pgprot_val(__P110) ^
+				pgprot_val(__S110));
+	else if ((vma->vm_flags & (VM_READ|VM_EXEC)) == VM_EXEC)
+		return __pgprot(pgprot_val(pgprot) ^
+				pgprot_val(__P011) ^
+				pgprot_val(__S011));
+	else
+		return __pgprot(pgprot_val(pgprot) ^
+				pgprot_val(__P010) ^
+				pgprot_val(__S010));
+}
+
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 			struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
 	struct address_space *mapping = inode->i_mapping;
 	struct block_device *bdev = bh->b_bdev;
@@ -530,7 +565,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 		.size = bh->b_size,
 	};
 	pgoff_t size;
-	int error;
+	int result;
 
 	i_mmap_lock_read(mapping);
 
@@ -542,15 +577,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	 * allocated past the end of the file.
 	 */
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (unlikely(vmf->pgoff >= size)) {
-		error = -EIO;
-		goto out;
-	}
+	if (unlikely(vmf->pgoff >= size))
+		goto sigbus;
 
-	if (dax_map_atomic(bdev, &dax) < 0) {
-		error = PTR_ERR(dax.addr);
-		goto out;
-	}
+	if (dax_map_atomic(bdev, &dax) < 0)
+		goto sigbus;
 
 	if (buffer_unwritten(bh) || buffer_new(bh)) {
 		clear_pmem(dax.addr, PAGE_SIZE);
@@ -558,17 +589,19 @@
 	}
 	dax_unmap_atomic(bdev, &dax);
 
-	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
-			vmf->flags & FAULT_FLAG_WRITE);
-	if (error)
-		goto out;
+	if (dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, write))
+		goto sigbus;
 
-	error = vm_insert_mixed(vma, vaddr, dax.pfn);
+	result = vmf_insert_pfn_prot(vma, vaddr, dax.pfn,
+			dax_pgprot(vma, write));
 
  out:
 	i_mmap_unlock_read(mapping);
+	return result;
 
-	return error;
+ sigbus:
+	result = VM_FAULT_SIGBUS;
+	goto out;
 }
 
 /**
@@ -599,7 +632,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	unsigned blkbits = inode->i_blkbits;
 	sector_t block;
 	pgoff_t size;
-	int error;
+	int result, error;
 	int major = 0;
 
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -701,19 +734,19 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 * indicate what the callback should do via the uptodate variable, same
 	 * as for normal BH based IO completions.
	 */
-	error = dax_insert_mapping(inode, &bh, vma, vmf);
+	result = dax_insert_mapping(inode, &bh, vma, vmf);
 	if (buffer_unwritten(&bh)) {
 		if (complete_unwritten)
-			complete_unwritten(&bh, !error);
+			complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
 		else
 			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
 	}
+	return result | major;
 
  out:
 	if (error == -ENOMEM)
 		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if ((error < 0) && (error != -EBUSY))
+	if (error < 0)
 		return VM_FAULT_SIGBUS | major;
 	return VM_FAULT_NOPAGE | major;
-- 
2.7.0.rc3
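
A note on the protection-map XOR trick in dax_pgprot() above: __Pxxx and
__Sxxx are the protection-map entries used for private and shared mappings
with the given read/write/execute combination, and vm_page_prot for a
private ("copy") VMA is derived from the __P entry.  XORing that entry back
out and XORing the matching __S entry in therefore yields the shared,
writable protection, while leaving any other attribute bits the VMA has
accumulated (cachability flags, say) untouched.  The sketch below
demonstrates the identity in plain userspace C; the PROT_* bit values are
invented stand-ins for illustration, not any architecture's real PTE bits:

/*
 * Userspace sketch of the pgprot XOR trick (not kernel code).  If
 * pgprot was derived from private entry P, then (pgprot ^ P ^ S)
 * swaps it to shared entry S while preserving unrelated bits.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PROT_PRESENT	0x001ULL	/* hypothetical "present" bit */
#define PROT_WRITE	0x002ULL	/* hypothetical hardware write bit */
#define PROT_USER	0x004ULL	/* hypothetical user-access bit */
#define PROT_NOCACHE	0x010ULL	/* unrelated attribute bit */

/* Private read/write omits the write bit so stores fault (COW);
 * the shared variant includes it. */
static const uint64_t P110 = PROT_PRESENT | PROT_USER;
static const uint64_t S110 = PROT_PRESENT | PROT_USER | PROT_WRITE;

int main(void)
{
	/* vm_page_prot of a private r/w VMA, plus an extra attribute */
	uint64_t pgprot = P110 | PROT_NOCACHE;

	/* XOR the private base protection out, XOR the shared one in */
	uint64_t writable = pgprot ^ P110 ^ S110;

	/* Result: the shared (writable) protection, attribute intact */
	assert(writable == (S110 | PROT_NOCACHE));
	printf("0x%02llx -> 0x%02llx\n",
	       (unsigned long long)pgprot, (unsigned long long)writable);
	return 0;
}

This is also why dax_pgprot() dispatches on vm_flags instead of simply
setting a write bit: which hardware bit (or absence of one) means
"writable" is architecture-specific, so going back through the protection
map is the portable way to compute the writable variant.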