From: Matthew Wilcox <willy@xxxxxxxxxxxxxxx>

This glop of impossible-to-review code implements a number of ideas that need to be separated out.

- Eliminate vm_ops->huge_fault. The core calls ->fault instead, and callers who set VM_HUGEPAGE should be prepared to deal with FAULT_FLAG_SIZE_PMD (and larger).
- Switch back to calling ->page_mkwrite instead of ->pfn_mkwrite. DAX now always has a page to lock, and no other implementations of ->pfn_mkwrite exist.
- Split dax_mkwrite out from dax_fault. dax_fault will now never call get_block() to allocate a block, only to see if a block has been allocated; dax_mkwrite will always attempt to allocate a block.
- Filesystems now take their DAX allocation mutex in exclusive/write mode when calling dax_mkwrite.
- Split out dax_insert_pmd_mapping() from dax_pmd_fault() and share it with the new dax_pmd_mkwrite().
- Change dax_pmd_fault() to take a vm_fault argument like the rest of the family of functions.

Signed-off-by: Matthew Wilcox <willy@xxxxxxxxxxxxxxx>
---
 Documentation/filesystems/Locking | 8 -
 Documentation/filesystems/dax.txt | 5 +-
 fs/block_dev.c | 10 +-
 fs/dax.c | 433 +++++++++++++++++++++++++-------------
 fs/ext2/file.c | 35 +--
 fs/ext4/file.c | 96 +++------
 fs/xfs/xfs_file.c | 95 ++-------
 fs/xfs/xfs_trace.h | 2 -
 include/linux/dax.h | 4 +-
 include/linux/mm.h | 4 -
 mm/memory.c | 51 +----
 mm/mmap.c | 2 +-
 12 files changed, 359 insertions(+), 386 deletions(-)

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 619af9b..1be09e7 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -522,7 +522,6 @@ prototypes: void (*close)(struct vm_area_struct*); int (*fault)(struct vm_area_struct*, struct vm_fault *); int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); - int (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *); int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: @@ -532,7 +531,6 @@ close: yes fault: yes can return with page locked map_pages: yes page_mkwrite: yes can return with page locked -pfn_mkwrite: yes access: yes ->fault() is called when a previously not present pte is about @@ -559,12 +557,6 @@ the page has been truncated, the filesystem should not look up a new page like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which will cause the VM to retry the fault. - ->pfn_mkwrite() is the same as page_mkwrite but when the pte is -VM_PFNMAP or VM_MIXEDMAP with a page-less entry. Expected return is -VM_FAULT_NOPAGE. Or one of the VM_FAULT_ERROR types. The default behavior -after this call is to make the pte read-write, unless pfn_mkwrite returns -an error. - ->access() is called when get_user_pages() fails in access_process_vm(), typically used to debug a process through /proc/pid/mem or ptrace.
This function is needed only for diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 2fe9e74..ff62feb 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -62,9 +62,8 @@ Filesystem support consists of dax_do_io() instead of blockdev_direct_IO() if S_DAX is set - implementing an mmap file operation for DAX files which sets the VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to - include handlers for fault, huge_fault and page_mkwrite (which should - probably call dax_fault() and dax_mkwrite(), passing the appropriate - get_block() callback) + include handlers for fault and page_mkwrite (which should probably call + dax_fault() and dax_mkwrite(), passing the appropriate get_block() callback) - calling dax_truncate_page() instead of block_truncate_page() for DAX files - calling dax_zero_page_range() instead of zero_user() for DAX files - ensuring that there is sufficient locking between reads, writes, diff --git a/fs/block_dev.c b/fs/block_dev.c index a9474ac..78697fe 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1722,7 +1722,7 @@ static const struct address_space_operations def_blk_aops = { * * Finally, unlike the filemap_page_mkwrite() case there is no * filesystem superblock to sync against freezing. We still include a - * pfn_mkwrite callback for dax drivers to receive write fault + * page_mkwrite callback for dax drivers to receive write fault * notifications. */ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -1730,6 +1730,11 @@ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return dax_fault(vma, vmf, blkdev_get_block, NULL); } +static int blkdev_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, blkdev_get_block, NULL); +} + static void blkdev_vm_open(struct vm_area_struct *vma) { struct inode *bd_inode = bdev_file_inode(vma->vm_file); @@ -1754,8 +1759,7 @@ static const struct vm_operations_struct blkdev_dax_vm_ops = { .open = blkdev_vm_open, .close = blkdev_vm_close, .fault = blkdev_dax_fault, - .huge_fault = blkdev_dax_fault, - .pfn_mkwrite = blkdev_dax_fault, + .page_mkwrite = blkdev_dax_mkwrite, }; static const struct vm_operations_struct blkdev_default_vm_ops = { diff --git a/fs/dax.c b/fs/dax.c index dbaf62c..952c2c2 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -372,7 +372,7 @@ static int dax_radix_entry(struct address_space *mapping, pgoff_t index, if (sector == NO_SECTOR) { /* - * This can happen during correct operation if our pfn_mkwrite + * This can happen during correct operation if our page_mkwrite * fault raced against a hole punch operation. 
If this * happens the pte that was hole punched will have been * unmapped and the radix tree entry will have been removed by @@ -584,7 +584,6 @@ static int dax_pte_fault(struct vm_area_struct *vma, struct vm_fault *vmf, sector_t block; pgoff_t size; int error; - int major = 0; size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) @@ -624,20 +623,8 @@ static int dax_pte_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (error) goto unlock_page; - if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { - if (vmf->flags & FAULT_FLAG_WRITE) { - error = get_block(inode, block, &bh, 1); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; - if (error) - goto unlock_page; - } else { - return dax_load_hole(mapping, page, vmf); - } - } + if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) + return dax_load_hole(mapping, page, vmf); if (vmf->cow_page) { struct page *new_page = vmf->cow_page; @@ -655,16 +642,101 @@ static int dax_pte_fault(struct vm_area_struct *vma, struct vm_fault *vmf, PAGE_SHIFT; if (vmf->pgoff >= size) { i_mmap_unlock_read(mapping); - error = -EIO; - goto out; + return VM_FAULT_SIGBUS; } } return VM_FAULT_LOCKED; } - /* Check we didn't race with a read fault installing a new page */ - if (!page && major) - page = find_lock_page(mapping, vmf->pgoff); + /* + * If we successfully insert a mapping to an unwritten extent, we + * need to convert the unwritten extent. If there is an error + * inserting the mapping, the filesystem needs to leave it as + * unwritten to prevent exposure of the stale underlying data to + * userspace, but we still need to call the completion function so + * the private resources on the mapping buffer can be released. We + * indicate what the callback should do via the uptodate variable, + * same as for normal BH based IO completions. 
+ */ + error = dax_insert_mapping(inode, &bh, vma, vmf); + if (buffer_unwritten(&bh)) { + if (complete_unwritten) + complete_unwritten(&bh, !error); + else + WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); + } + + out: + if (error == -ENOMEM) + return VM_FAULT_OOM; + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if ((error < 0) && (error != -EBUSY)) + return VM_FAULT_SIGBUS; + return VM_FAULT_NOPAGE; + + unlock_page: + if (page) { + unlock_page(page); + page_cache_release(page); + } + goto out; +} + +static int dax_pte_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block, dax_iodone_t complete_unwritten) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head bh; + unsigned blkbits = inode->i_blkbits; + sector_t block; + pgoff_t size; + int error; + int major = 0; + + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + return VM_FAULT_SIGBUS; + + memset(&bh, 0, sizeof(bh)); + block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_bdev = inode->i_sb->s_bdev; + bh.b_size = PAGE_SIZE; + + repeat: + page = find_get_page(mapping, vmf->pgoff); + if (page) { + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + page_cache_release(page); + return VM_FAULT_RETRY; + } + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + /* + * We have a struct page covering a hole in the file + * from a read fault and we've raced with a truncate + */ + error = -EIO; + goto unlock_page; + } + } + + error = get_block(inode, block, &bh, 1); + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; /* fs corruption? */ + if (error) + goto unlock_page; + + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; if (page) { unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, @@ -675,16 +747,6 @@ static int dax_pte_fault(struct vm_area_struct *vma, struct vm_fault *vmf, page = NULL; } - /* - * If we successfully insert the new mapping over an unwritten extent, - * we need to ensure we convert the unwritten extent. If there is an - * error inserting the mapping, the filesystem needs to leave it as - * unwritten to prevent exposure of the stale underlying data to - * userspace, but we still need to call the completion function so - * the private resources on the mapping buffer can be released. We - * indicate what the callback should do via the uptodate variable, same - * as for normal BH based IO completions. 
- */ error = dax_insert_mapping(inode, &bh, vma, vmf); if (buffer_unwritten(&bh)) { if (complete_unwritten) @@ -734,22 +796,101 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address, #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") -static int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) +static int dax_insert_pmd_mapping(struct inode *inode, struct buffer_head *bh, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct blk_dax_ctl dax = { + .sector = to_sector(bh, inode), + .size = PMD_SIZE, + }; + struct block_device *bdev = bh->b_bdev; + bool write = vmf->flags & FAULT_FLAG_WRITE; + unsigned long address = (unsigned long)vmf->virtual_address; + pgoff_t pgoff = linear_page_index(vma, address & PMD_MASK); + int major; + long length; + + length = dax_map_atomic(bdev, &dax); + if (length < 0) + return VM_FAULT_SIGBUS; + if (length < PMD_SIZE) { + dax_pmd_dbg(bh, address, "dax-length too small"); + goto unmap; + } + + if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { + dax_pmd_dbg(bh, address, "pfn unaligned"); + goto unmap; + } + + if (!pfn_t_devmap(dax.pfn)) { + dax_pmd_dbg(bh, address, "pfn not in memmap"); + goto unmap; + } + + if (buffer_unwritten(bh) || buffer_new(bh)) { + clear_pmem(dax.addr, PMD_SIZE); + wmb_pmem(); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + } else + major = 0; + + dax_unmap_atomic(bdev, &dax); + + /* + * For PTE faults we insert a radix tree entry for reads, and + * leave it clean. Then on the first write we dirty the radix + * tree entry via the dax_mkwrite() path. This sequence + * allows the dax_mkwrite() call to be simpler and avoid a + * call into get_block() to translate the pgoff to a sector in + * order to be able to create a new radix tree entry. + * + * The PMD path doesn't have an equivalent to + * dax_mkwrite(), though, so for a read followed by a + * write we traverse all the way through __dax_pmd_fault() + * twice. This means we can just skip inserting a radix tree + * entry completely on the initial read and just wait until + * the write to insert a dirty entry. 
+ */ + if (write) { + int error = dax_radix_entry(vma->vm_file->f_mapping, pgoff, + dax.sector, true, true); + if (error) { + dax_pmd_dbg(bh, address, "PMD radix insertion failed"); + goto fallback; + } + } + + dev_dbg(part_to_dev(bdev->bd_part), + "%s: %s addr: %lx pfn: %lx sect: %llx\n", + __func__, current->comm, address, + pfn_t_to_pfn(dax.pfn), + (unsigned long long) dax.sector); + return major | vmf_insert_pfn_pmd(vma, address, vmf->pmd, dax.pfn, + write); + + unmap: + dax_unmap_atomic(bdev, &dax); + fallback: + return VM_FAULT_FALLBACK; +} + +static int dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct buffer_head bh; unsigned blkbits = inode->i_blkbits; + unsigned long address = (unsigned long)vmf->virtual_address; unsigned long pmd_addr = address & PMD_MASK; - bool write = flags & FAULT_FLAG_WRITE; - struct block_device *bdev; + bool write = vmf->flags & FAULT_FLAG_WRITE; pgoff_t size, pgoff; sector_t block; - int error, result = 0; - bool alloc = false; + int result = 0; /* dax pmd mappings require pfn_t_devmap() */ if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) @@ -757,7 +898,7 @@ static int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, vmf->pmd, address); dax_pmd_dbg(NULL, address, "cow write"); return VM_FAULT_FALLBACK; } @@ -791,14 +932,6 @@ static int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (get_block(inode, block, &bh, 0) != 0) return VM_FAULT_SIGBUS; - if (!buffer_mapped(&bh) && write) { - if (get_block(inode, block, &bh, 1) != 0) - return VM_FAULT_SIGBUS; - alloc = true; - } - - bdev = bh.b_bdev; - /* * If the filesystem isn't willing to tell us the length of a hole, * just fall back to PTEs. 
Calling get_block 512 times in a loop @@ -809,17 +942,6 @@ static int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, return VM_FAULT_FALLBACK; } - /* - * If we allocated new storage, make sure no process has any - * zero pages covering this hole - */ - if (alloc) { - loff_t lstart = pgoff << PAGE_SHIFT; - loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ - - truncate_pagecache_range(inode, lstart, lend); - } - i_mmap_lock_read(mapping); /* @@ -839,9 +961,9 @@ static int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, goto fallback; } - if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { + if (!buffer_mapped(&bh) && buffer_uptodate(&bh)) { spinlock_t *ptl; - pmd_t entry; + pmd_t entry, *pmd = vmf->pmd; struct page *zero_page = get_huge_zero_page(); if (unlikely(!zero_page)) { @@ -856,7 +978,7 @@ static int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, goto fallback; } - dev_dbg(part_to_dev(bdev->bd_part), + dev_dbg(part_to_dev(bh.b_bdev->bd_part), "%s: %s addr: %lx pfn: <zero> sect: %llx\n", __func__, current->comm, address, (unsigned long long) to_sector(&bh, inode)); @@ -867,75 +989,90 @@ static int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, result = VM_FAULT_NOPAGE; spin_unlock(ptl); } else { - struct blk_dax_ctl dax = { - .sector = to_sector(&bh, inode), - .size = PMD_SIZE, - }; - long length = dax_map_atomic(bdev, &dax); + result |= dax_insert_pmd_mapping(inode, &bh, vma, vmf); + } - if (length < 0) { - result = VM_FAULT_SIGBUS; - goto out; - } - if (length < PMD_SIZE) { - dax_pmd_dbg(&bh, address, "dax-length too small"); - dax_unmap_atomic(bdev, &dax); - goto fallback; - } - if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { - dax_pmd_dbg(&bh, address, "pfn unaligned"); - dax_unmap_atomic(bdev, &dax); - goto fallback; - } + out: + i_mmap_unlock_read(mapping); - if (!pfn_t_devmap(dax.pfn)) { - dax_unmap_atomic(bdev, &dax); - dax_pmd_dbg(&bh, address, "pfn not in memmap"); - goto fallback; - } + if (buffer_unwritten(&bh)) + complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); - if (buffer_unwritten(&bh) || buffer_new(&bh)) { - clear_pmem(dax.addr, PMD_SIZE); - wmb_pmem(); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - result |= VM_FAULT_MAJOR; - } - dax_unmap_atomic(bdev, &dax); + return result; - /* - * For PTE faults we insert a radix tree entry for reads, and - * leave it clean. Then on the first write we dirty the radix - * tree entry via the dax_pfn_mkwrite() path. This sequence - * allows the dax_pfn_mkwrite() call to be simpler and avoid a - * call into get_block() to translate the pgoff to a sector in - * order to be able to create a new radix tree entry. - * - * The PMD path doesn't have an equivalent to - * dax_pfn_mkwrite(), though, so for a read followed by a - * write we traverse all the way through __dax_pmd_fault() - * twice. This means we can just skip inserting a radix tree - * entry completely on the initial read and just wait until - * the write to insert a dirty entry. 
- */ - if (write) { - error = dax_radix_entry(mapping, pgoff, dax.sector, - true, true); - if (error) { - dax_pmd_dbg(&bh, address, - "PMD radix insertion failed"); - goto fallback; - } - } + fallback: + count_vm_event(THP_FAULT_FALLBACK); + result = VM_FAULT_FALLBACK; + goto out; +} - dev_dbg(part_to_dev(bdev->bd_part), - "%s: %s addr: %lx pfn: %lx sect: %llx\n", - __func__, current->comm, address, - pfn_t_to_pfn(dax.pfn), - (unsigned long long) dax.sector); - result |= vmf_insert_pfn_pmd(vma, address, pmd, - dax.pfn, write); +static int dax_pmd_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block, dax_iodone_t complete_unwritten) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + unsigned long address = (unsigned long)vmf->virtual_address; + struct buffer_head bh; + unsigned blkbits = inode->i_blkbits; + unsigned long pmd_addr = address & PMD_MASK; + struct block_device *bdev; + pgoff_t size, pgoff; + loff_t lstart, lend; + sector_t block; + int result = 0; + + pgoff = linear_page_index(vma, pmd_addr); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= size) + return VM_FAULT_SIGBUS; + + memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; + block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); + + bh.b_size = PMD_SIZE; + + if (get_block(inode, block, &bh, 1) != 0) + return VM_FAULT_SIGBUS; + + bdev = bh.b_bdev; + + /* + * If the filesystem isn't willing to tell us the length of a hole, + * just fall back to PTEs. Calling get_block 512 times in a loop + * would be silly. + */ + if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { + dax_pmd_dbg(&bh, address, "allocated block too small"); + return VM_FAULT_FALLBACK; + } + + /* Make sure no process has any zero pages covering this hole */ + lstart = pgoff << PAGE_SHIFT; + lend = lstart + PMD_SIZE - 1; /* inclusive */ + truncate_pagecache_range(inode, lstart, lend); + + i_mmap_lock_read(mapping); + + /* + * If a truncate happened while we were allocating blocks, we may + * leave blocks allocated to the file that are beyond EOF. We can't + * take i_mutex here, so just leave them hanging; they'll be freed + * when the file is deleted. 
+ */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= size) { + result = VM_FAULT_SIGBUS; + goto out; } + if ((pgoff | PG_PMD_COLOUR) >= size) { + dax_pmd_dbg(&bh, address, + "offset + huge page size > file size"); + goto fallback; + } + + result |= dax_insert_pmd_mapping(inode, &bh, vma, vmf); out: i_mmap_unlock_read(mapping); @@ -951,9 +1088,13 @@ static int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, goto out; } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) +static int dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block, dax_iodone_t complete_unwritten) +{ + return VM_FAULT_FALLBACK; +} +static int dax_pmd_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block, dax_iodone_t complete_unwritten) { return VM_FAULT_FALLBACK; } @@ -978,13 +1119,11 @@ static int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block, dax_iodone_t iodone) { - unsigned long address = (unsigned long)vmf->virtual_address; switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { case FAULT_FLAG_SIZE_PTE: return dax_pte_fault(vma, vmf, get_block, iodone); case FAULT_FLAG_SIZE_PMD: - return dax_pmd_fault(vma, address, vmf->pmd, vmf->flags, - get_block, iodone); + return dax_pmd_fault(vma, vmf, get_block, iodone); default: return VM_FAULT_FALLBACK; } @@ -992,26 +1131,30 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, EXPORT_SYMBOL_GPL(dax_fault); /** - * dax_pfn_mkwrite - handle first write to DAX page + * dax_mkwrite - handle first write to a DAX page * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * @iodone: The filesystem method used to convert unwritten blocks + * to written so the data written to them is exposed. This is required + * by write faults for filesystems that will return unwritten extent + * mappings from @get_block, but it is optional for reads as + * dax_insert_mapping() will always zero unwritten blocks. If the fs + * does not support unwritten extents, then it should pass NULL. */ -int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block, dax_iodone_t iodone) { - struct file *file = vma->vm_file; - - /* - * We pass NO_SECTOR to dax_radix_entry() because we expect that a - * RADIX_DAX_PTE entry already exists in the radix tree from a - * previous call to __dax_fault(). We just want to look up that PTE - * entry using vmf->pgoff and make sure the dirty tag is set. This - * saves us from having to make a call to get_block() here to look - * up the sector. 
- */ - dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true); - return VM_FAULT_NOPAGE; + switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { + case FAULT_FLAG_SIZE_PTE: + return dax_pte_mkwrite(vma, vmf, get_block, iodone); + case FAULT_FLAG_SIZE_PMD: + return dax_pmd_mkwrite(vma, vmf, get_block, iodone); + default: + return VM_FAULT_FALLBACK; + } } -EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); +EXPORT_SYMBOL_GPL(dax_mkwrite); /** * dax_zero_page_range - zero a range within a page of a DAX file diff --git a/fs/ext2/file.c b/fs/ext2/file.c index cf6f78c..6028c63 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -49,13 +49,14 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); } - down_read(&ei->dax_sem); + down_read(&ei->dax_sem); ret = dax_fault(vma, vmf, ext2_get_block, NULL); - up_read(&ei->dax_sem); + if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(inode->i_sb); + return ret; } @@ -67,44 +68,18 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); - down_read(&ei->dax_sem); + down_write(&ei->dax_sem); ret = dax_mkwrite(vma, vmf, ext2_get_block, NULL); + up_write(&ei->dax_sem); - up_read(&ei->dax_sem); - sb_end_pagefault(inode->i_sb); - return ret; -} - -static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); - loff_t size; - int ret; - - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - down_read(&ei->dax_sem); - - /* check that the faulting page hasn't raced with truncate */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else - ret = dax_pfn_mkwrite(vma, vmf); - - up_read(&ei->dax_sem); sb_end_pagefault(inode->i_sb); return ret; } static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, - .huge_fault = ext2_dax_fault, .page_mkwrite = ext2_dax_mkwrite, - .pfn_mkwrite = ext2_dax_pfn_mkwrite, }; static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 71859ed..72dcece 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -196,99 +196,65 @@ out: static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int result; - handle_t *handle = NULL; struct inode *inode = file_inode(vma->vm_file); struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { - unsigned nblocks; - switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { - case FAULT_FLAG_SIZE_PTE: - nblocks = EXT4_DATA_TRANS_BLOCKS(sb); - break; - case FAULT_FLAG_SIZE_PMD: - nblocks = ext4_chunk_trans_blocks(inode, - PMD_SIZE / PAGE_SIZE); - break; - default: - return VM_FAULT_FALLBACK; - } - sb_start_pagefault(sb); file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, nblocks); - } else - down_read(&EXT4_I(inode)->i_mmap_sem); + } - if (IS_ERR(handle)) - result = VM_FAULT_SIGBUS; - else - result = dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); + down_read(&EXT4_I(inode)->i_mmap_sem); + result = dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); + up_read(&EXT4_I(inode)->i_mmap_sem); - if (write) { - if (!IS_ERR(handle)) - ext4_journal_stop(handle); - up_read(&EXT4_I(inode)->i_mmap_sem); + if (write) sb_end_pagefault(sb); - } 
else - up_read(&EXT4_I(inode)->i_mmap_sem); return result; } static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - int err; + int result; struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; + handle_t *handle; + unsigned nblocks; + + switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { + case FAULT_FLAG_SIZE_PTE: + nblocks = EXT4_DATA_TRANS_BLOCKS(sb); + break; + case FAULT_FLAG_SIZE_PMD: + nblocks = ext4_chunk_trans_blocks(inode, PMD_SIZE / PAGE_SIZE); + break; + default: + return VM_FAULT_FALLBACK; + } sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - err = dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL); - up_read(&EXT4_I(inode)->i_mmap_sem); - sb_end_pagefault(inode->i_sb); - - return err; -} -/* - * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite() - * handler we check for races agaist truncate. Note that since we cycle through - * i_mmap_sem, we are sure that also any hole punching that began before we - * were called is finished by now and so if it included part of the file we - * are working on, our pte will get unmapped and the check for pte_same() in - * wp_pfn_shared() fails. Thus fault gets retried and things work out as - * desired. - */ -static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vma->vm_file); - struct super_block *sb = inode->i_sb; - loff_t size; - int ret; + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, nblocks); + if (IS_ERR(handle)) { + result = VM_FAULT_SIGBUS; + } else { + down_write(&EXT4_I(inode)->i_mmap_sem); + result = dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL); + up_write(&EXT4_I(inode)->i_mmap_sem); + ext4_journal_stop(handle); + } - sb_start_pagefault(sb); - file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else - ret = dax_pfn_mkwrite(vma, vmf); - up_read(&EXT4_I(inode)->i_mmap_sem); - sb_end_pagefault(sb); + sb_end_pagefault(inode->i_sb); - return ret; + return result; } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, - .huge_fault = ext4_dax_fault, .page_mkwrite = ext4_dax_mkwrite, - .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 6db703b..f51f09a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1517,22 +1517,25 @@ xfs_filemap_page_mkwrite( struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); + struct xfs_inode *ip = XFS_I(inode); int ret; - trace_xfs_filemap_page_mkwrite(XFS_I(inode)); + trace_xfs_filemap_page_mkwrite(ip); sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); ret = block_page_mkwrite_return(ret); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); } - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; @@ -1544,15 +1547,17 @@ xfs_filemap_fault( struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); + struct xfs_inode *ip = XFS_I(inode); int ret; - 
trace_xfs_filemap_fault(XFS_I(inode)); + trace_xfs_filemap_fault(ip); - /* DAX can shortcut the normal fault path on write faults! */ - if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) - return xfs_filemap_page_mkwrite(vma, vmf); + if (IS_DAX(inode) && vmf->flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + } - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { /* * we do not want to trigger unwritten extent conversion on read @@ -1563,88 +1568,18 @@ xfs_filemap_fault( ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL); } else ret = filemap_fault(vma, vmf); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - - return ret; -} - -/* - * Similar to xfs_filemap_fault(), the DAX fault path can call into here on - * both read and write faults. Hence we need to handle both cases. There is no - * ->huge_mkwrite callout for huge pages, so we have a single function here to - * handle both cases here. @flags carries the information on the type of fault - * occuring. - */ -STATIC int -xfs_filemap_huge_fault( - struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vma->vm_file); - struct xfs_inode *ip = XFS_I(inode); - int ret; - - if (!IS_DAX(inode)) - return VM_FAULT_FALLBACK; - - trace_xfs_filemap_huge_fault(ip); - - if (vmf->flags & FAULT_FLAG_WRITE) { - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - } - - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - if (vmf->flags & FAULT_FLAG_WRITE) + if (IS_DAX(inode) && vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(inode->i_sb); return ret; } -/* - * pfn_mkwrite was originally intended to ensure we capture time stamp - * updates on write faults. In reality, it's need to serialise against - * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED - * to ensure we serialise the fault barrier in place. 
- */ -static int -xfs_filemap_pfn_mkwrite( - struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - - struct inode *inode = file_inode(vma->vm_file); - struct xfs_inode *ip = XFS_I(inode); - int ret = VM_FAULT_NOPAGE; - loff_t size; - - trace_xfs_filemap_pfn_mkwrite(ip); - - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - - /* check if the faulting page hasn't raced with truncate */ - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else if (IS_DAX(inode)) - ret = dax_pfn_mkwrite(vma, vmf); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - sb_end_pagefault(inode->i_sb); - return ret; - -} - static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, - .huge_fault = xfs_filemap_huge_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, - .pfn_mkwrite = xfs_filemap_pfn_mkwrite, }; STATIC int diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index fb1f3e1..3f1515f 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -687,9 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DEFINE_INODE_EVENT(xfs_filemap_fault); -DEFINE_INODE_EVENT(xfs_filemap_huge_fault); DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); -DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), diff --git a/include/linux/dax.h b/include/linux/dax.h index 8e58c36..b9e745f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -12,8 +12,8 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); -int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); -#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) +int dax_mkwrite(struct vm_area_struct *, struct vm_fault *, get_block_t, + dax_iodone_t); static inline bool vma_is_dax(struct vm_area_struct *vma) { diff --git a/include/linux/mm.h b/include/linux/mm.h index b9d0979..eac1aeb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -281,16 +281,12 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); - int (*huge_fault)(struct vm_area_struct *, struct vm_fault *vmf); void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); - /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ - int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); - /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware */ diff --git a/mm/memory.c b/mm/memory.c index 03d49eb..0af34e2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2210,42 +2210,6 @@ oom: return VM_FAULT_OOM; } -/* - * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED - * mapping - */ -static int wp_pfn_shared(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, - pmd_t *pmd) -{ - 
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { - struct vm_fault vmf = { - .page = NULL, - .pgoff = linear_page_index(vma, address), - .virtual_address = (void __user *)(address & PAGE_MASK), - .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, - }; - int ret; - - pte_unmap_unlock(page_table, ptl); - ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); - if (ret & VM_FAULT_ERROR) - return ret; - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - /* - * We might have raced with another page fault while we - * released the pte_offset_map_lock. - */ - if (!pte_same(*page_table, orig_pte)) { - pte_unmap_unlock(page_table, ptl); - return 0; - } - } - return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, - NULL, 0, 0); -} - static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, @@ -2324,12 +2288,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * VM_PFNMAP VMA. * * We should not cow pages in a shared writeable mapping. - * Just mark the pages writable and/or call ops->pfn_mkwrite. + * Just mark the pages writable as we can't do any dirty + * accounting on raw pfn maps. */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_pfn_shared(mm, vma, address, page_table, ptl, - orig_pte, pmd); + return wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, 0, 0); pte_unmap_unlock(page_table, ptl); return wp_page_copy(mm, vma, address, page_table, pmd, @@ -3282,8 +3247,8 @@ static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); - if (vma->vm_ops->huge_fault) - return vma->vm_ops->huge_fault(vma, &vmf); + if (vma->vm_ops->fault) + return vma->vm_ops->fault(vma, &vmf); return VM_FAULT_FALLBACK; } @@ -3299,8 +3264,8 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, if (vma_is_anonymous(vma)) return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); - if (vma->vm_ops->huge_fault) - return vma->vm_ops->huge_fault(vma, &vmf); + if (vma->vm_ops->page_mkwrite) + return vma->vm_ops->page_mkwrite(vma, &vmf); return VM_FAULT_FALLBACK; } diff --git a/mm/mmap.c b/mm/mmap.c index 407ab43..0d851cb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1490,7 +1490,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) return 0; /* The backer wishes to know when pages are first written to? */ - if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite)) + if (vm_ops && vm_ops->page_mkwrite) return 1; /* The open routine did something to the protections that pgprot_modify -- 2.7.0.rc3 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html
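
As a reference for filesystems converting to the new entry points, the fault hooks end up looking roughly like the sketch below, modeled on the ext2 conversion above. This is illustrative only and not part of the patch: example_get_block and example_dax_sem stand in for the filesystem's own get_block callback and per-inode DAX allocation lock.

    #include <linux/buffer_head.h>
    #include <linux/dax.h>
    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/rwsem.h>

    /* Placeholders for the filesystem's block-mapping callback and DAX lock */
    static int example_get_block(struct inode *inode, sector_t iblock,
                                 struct buffer_head *bh_result, int create);
    static DECLARE_RWSEM(example_dax_sem);

    static int example_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
    {
            struct inode *inode = file_inode(vma->vm_file);
            int ret;

            if (vmf->flags & FAULT_FLAG_WRITE) {
                    sb_start_pagefault(inode->i_sb);
                    file_update_time(vma->vm_file);
            }

            /* dax_fault() never allocates blocks, so a shared lock suffices */
            down_read(&example_dax_sem);
            ret = dax_fault(vma, vmf, example_get_block, NULL);
            up_read(&example_dax_sem);

            if (vmf->flags & FAULT_FLAG_WRITE)
                    sb_end_pagefault(inode->i_sb);
            return ret;
    }

    static int example_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
    {
            struct inode *inode = file_inode(vma->vm_file);
            int ret;

            sb_start_pagefault(inode->i_sb);
            file_update_time(vma->vm_file);

            /* dax_mkwrite() may allocate, so take the DAX lock exclusively */
            down_write(&example_dax_sem);
            ret = dax_mkwrite(vma, vmf, example_get_block, NULL);
            up_write(&example_dax_sem);

            sb_end_pagefault(inode->i_sb);
            return ret;
    }

    static const struct vm_operations_struct example_dax_vm_ops = {
            .fault          = example_dax_fault,    /* handles PTE and PMD sizes */
            .page_mkwrite   = example_dax_mkwrite,  /* replaces ->pfn_mkwrite */
    };

A real filesystem would of course use its per-inode lock (as ext2 does with dax_sem and ext4 with i_mmap_sem) rather than a file-scope rwsem; the point is only the shared-vs-exclusive locking split between dax_fault() and dax_mkwrite().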