Map the zero page on reading a memfd created with MFD_ZEROPAGE and COW
it when a write occurs.  Only mark the inode with S_ZEROPAGE after
shmem_file_setup() is known to have succeeded, so an ERR_PTR return is
never dereferenced.

Signed-off-by: Peng Liang <liangpeng10@xxxxxxxxxx>
---
 include/linux/fs.h         |  2 ++
 include/uapi/linux/memfd.h |  1 +
 mm/memfd.c                 |  8 ++++++--
 mm/memory.c                | 37 ++++++++++++++++++++++++++++++++++---
 mm/shmem.c                 | 10 ++++++++--
 5 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbf812ce89a8..404c0c26ba98 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2249,6 +2249,7 @@ struct super_operations {
 #define S_ENCRYPTED	(1 << 14) /* Encrypted file (using fs/crypto/) */
 #define S_CASEFOLD	(1 << 15) /* Casefolded file */
 #define S_VERITY	(1 << 16) /* Verity file (using fs/verity/) */
+#define S_ZEROPAGE	(1 << 17) /* Map zero page on read, COW on write */
 
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2291,6 +2292,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
 #define IS_ENCRYPTED(inode)	((inode)->i_flags & S_ENCRYPTED)
 #define IS_CASEFOLDED(inode)	((inode)->i_flags & S_CASEFOLD)
 #define IS_VERITY(inode)	((inode)->i_flags & S_VERITY)
+#define IS_ZEROPAGE(inode)	((inode)->i_flags & S_ZEROPAGE)
 
 #define IS_WHITEOUT(inode)	(S_ISCHR(inode->i_mode) && \
 				 (inode)->i_rdev == WHITEOUT_DEV)
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 7a8a26751c23..2bfac06f53fb 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -8,6 +8,7 @@
 #define MFD_CLOEXEC		0x0001U
 #define MFD_ALLOW_SEALING	0x0002U
 #define MFD_HUGETLB		0x0004U
+#define MFD_ZEROPAGE		0x0008U
 
 /*
  * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
diff --git a/mm/memfd.c b/mm/memfd.c
index 9f80f162791a..5c167b2de9ae 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -245,7 +245,7 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
 #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
 
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_ZEROPAGE)
 
 SYSCALL_DEFINE2(memfd_create,
 		const char __user *, uname,
@@ -301,8 +301,12 @@ SYSCALL_DEFINE2(memfd_create,
 			HUGETLB_ANONHUGE_INODE,
 			(flags >> MFD_HUGE_SHIFT) &
 			MFD_HUGE_MASK);
-	} else
+	} else {
 		file = shmem_file_setup(name, 0, VM_NORESERVE);
+		/* Only mark the inode once shmem_file_setup() succeeded. */
+		if (!IS_ERR(file) && (flags & MFD_ZEROPAGE))
+			file_inode(file)->i_flags |= S_ZEROPAGE;
+	}
 	if (IS_ERR(file)) {
 		error = PTR_ERR(file);
 		goto err_fd;
diff --git a/mm/memory.c b/mm/memory.c
index 8f1de811a1dc..360606964a7d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3208,6 +3208,26 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 	return ret;
 }
 
+static vm_fault_t do_shared_fault(struct vm_fault *vmf);
+
+static vm_fault_t wp_zero_shared(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct mmu_notifier_range range;
+	vm_fault_t ret;
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+				vmf->address & PAGE_MASK,
+				(vmf->address & PAGE_MASK) + PAGE_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	ret = do_shared_fault(vmf);
+	mmu_notifier_invalidate_range_only_end(&range);
+	return ret;
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -3254,8 +3274,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 	 * Just mark the pages writable and/or call ops->pfn_mkwrite.
 	 */
 	if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-					(VM_WRITE|VM_SHARED))
-		return wp_pfn_shared(vmf);
+					(VM_WRITE|VM_SHARED)) {
+		if (unlikely(vma->vm_file &&
+			     IS_ZEROPAGE(file_inode(vma->vm_file)) &&
+			     is_zero_pfn(pte_pfn(*vmf->pte)))) {
+			return wp_zero_shared(vmf);
+		} else {
+			return wp_pfn_shared(vmf);
+		}
+	}
 
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	return wp_page_copy(vmf);
@@ -3970,12 +3997,16 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	else if (unlikely(vma->vm_file && IS_ZEROPAGE(file_inode(vma->vm_file)) &&
+			  is_zero_pfn(page_to_pfn(page))))
+		entry = pte_mkspecial(pte_wrprotect(entry));
 	/* copy-on-write page */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, addr, false);
 		lru_cache_add_inactive_or_unevictable(page, vma);
-	} else {
+	} else if (likely(!vma->vm_file || !IS_ZEROPAGE(file_inode(vma->vm_file)) ||
+			  !is_zero_pfn(page_to_pfn(page)))) {
 		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
 		page_add_file_rmap(page, false);
 	}
diff --git a/mm/shmem.c b/mm/shmem.c
index 18f93c2d68f1..f4b23124826d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1899,8 +1899,14 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 		page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
 		if (IS_ERR(page)) {
 alloc_nohuge:
-			page = shmem_alloc_and_acct_page(gfp, inode,
-							 index, false);
+			if (IS_ZEROPAGE(inode) && vmf &&
+			    !(vmf->flags & FAULT_FLAG_WRITE)) {
+				page = ZERO_PAGE(0);
+				goto out;
+			} else {
+				page = shmem_alloc_and_acct_page(gfp, inode,
+								 index, false);
+			}
 		}
 		if (IS_ERR(page)) {
 			int retry = 5;
-- 
2.33.1