[RFC PATCH 17/19] mm: Convert mmap and mlock to use account_locked_vm

Alistair Popple <apopple@xxxxxxxxxx> · Tue, 24 Jan 2023 16:42:46 +1100

A future change introduces a cgroup to control the amount of
locked/pinned memory on the system. To ensure memory pinned via mlock
and mmap is accounted for use the common account_locked_vm()
function.

As cgroups can outlive individual processes also unaccount for the
locked memory during process teardown.

This patch should introduce no user visible change.

Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
Cc: linux-mm@xxxxxxxxx
Cc: linux-kernel@xxxxxxxxxxxxxxx
---
 mm/internal.h  |  2 +-
 mm/mlock.c     | 76 ++++++++++-----------------------------------------
 mm/mmap.c      | 76 +++++++++++++++++++++++++--------------------------
 mm/mremap.c    | 54 ++++++++++++++++++++++++++----------
 mm/secretmem.c |  6 +---
 5 files changed, 95 insertions(+), 119 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index bcf75a8..7c8c3f2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -489,8 +489,6 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
 extern long faultin_vma_page_range(struct vm_area_struct *vma,
 				   unsigned long start, unsigned long end,
 				   bool write, int *locked);
-extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
-			      unsigned long len);
 /*
  * mlock_vma_page() and munlock_vma_page():
  * should be called with vma's mmap_lock held for read or write,
diff --git a/mm/mlock.c b/mm/mlock.c
index 7032f6d..a97c8c5 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -416,6 +416,20 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
 		goto out;
 
+	/*
+	 * Keep track of amount of locked VM.
+	 */
+	nr_pages = (end - start) >> PAGE_SHIFT;
+	if (!(newflags & VM_LOCKED)) {
+		__unaccount_locked_vm(mm, nr_pages);
+	} else if (!(oldflags & VM_LOCKED)) {
+		if (__account_locked_vm(mm, nr_pages, current,
+						capable(CAP_IPC_LOCK))) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
 			  vma->vm_file, pgoff, vma_policy(vma),
@@ -439,16 +453,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 success:
 	/*
-	 * Keep track of amount of locked VM.
-	 */
-	nr_pages = (end - start) >> PAGE_SHIFT;
-	if (!(newflags & VM_LOCKED))
-		nr_pages = -nr_pages;
-	else if (oldflags & VM_LOCKED)
-		nr_pages = 0;
-	mm->locked_vm += nr_pages;
-
-	/*
 	 * vm_flags is protected by the mmap_lock held in write mode.
 	 * It's okay if try_to_unmap_one unmaps a page just after we
 	 * set VM_LOCKED, populate_vma_page_range will bring it back.
@@ -517,42 +521,6 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
 }
 
 /*
- * Go through vma areas and sum size of mlocked
- * vma pages, as return value.
- * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT)
- * is also counted.
- * Return value: previously mlocked page counts
- */
-static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
-		unsigned long start, size_t len)
-{
-	struct vm_area_struct *vma;
-	unsigned long count = 0;
-	unsigned long end;
-	VMA_ITERATOR(vmi, mm, start);
-
-	/* Don't overflow past ULONG_MAX */
-	if (unlikely(ULONG_MAX - len < start))
-		end = ULONG_MAX;
-	else
-		end = start + len;
-
-	for_each_vma_range(vmi, vma, end) {
-		if (vma->vm_flags & VM_LOCKED) {
-			if (start > vma->vm_start)
-				count -= (start - vma->vm_start);
-			if (end < vma->vm_end) {
-				count += end - vma->vm_start;
-				break;
-			}
-			count += vma->vm_end - vma->vm_start;
-		}
-	}
-
-	return count >> PAGE_SHIFT;
-}
-
-/*
  * convert get_user_pages() return value to posix mlock() error
  */
 static int __mlock_posix_error_return(long retval)
@@ -585,21 +553,7 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
 	if (mmap_write_lock_killable(current->mm))
 		return -EINTR;
 
-	locked += current->mm->locked_vm;
-	if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
-		/*
-		 * It is possible that the regions requested intersect with
-		 * previously mlocked areas, that part area in "mm->locked_vm"
-		 * should not be counted to new mlock increment count. So check
-		 * and adjust locked count if necessary.
-		 */
-		locked -= count_mm_mlocked_page_nr(current->mm,
-				start, len);
-	}
-
-	/* check against resource limits */
-	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
-		error = apply_vma_lock_flags(start, len, flags);
+	error = apply_vma_lock_flags(start, len, flags);
 
 	mmap_write_unlock(current->mm);
 	if (error)
diff --git a/mm/mmap.c b/mm/mmap.c
index 425a934..2c05582 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -160,7 +160,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len)
 	if (IS_ERR_VALUE(mapped_addr))
 		return mapped_addr;
 
-	return mlock_future_check(current->mm, current->mm->def_flags, len);
+	return 0;
 }
 static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
 			 unsigned long newbrk, unsigned long oldbrk,
@@ -1184,23 +1184,6 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
 	return hint;
 }
 
-int mlock_future_check(struct mm_struct *mm, unsigned long flags,
-		       unsigned long len)
-{
-	unsigned long locked, lock_limit;
-
-	/*  mlock MCL_FUTURE? */
-	if (flags & VM_LOCKED) {
-		locked = len >> PAGE_SHIFT;
-		locked += mm->locked_vm;
-		lock_limit = rlimit(RLIMIT_MEMLOCK);
-		lock_limit >>= PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			return -EAGAIN;
-	}
-	return 0;
-}
-
 static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
 {
 	if (S_ISREG(inode->i_mode))
@@ -1310,9 +1293,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 		if (!can_do_mlock())
 			return -EPERM;
 
-	if (mlock_future_check(mm, vm_flags, len))
-		return -EAGAIN;
-
 	if (file) {
 		struct inode *inode = file_inode(file);
 		unsigned long flags_mask;
@@ -1882,22 +1862,27 @@ static int acct_stack_growth(struct vm_area_struct *vma,
 	if (size > rlimit(RLIMIT_STACK))
 		return -ENOMEM;
 
-	/* mlock limit tests */
-	if (mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT))
-		return -ENOMEM;
-
 	/* Check to ensure the stack will not grow into a hugetlb-only region */
 	new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
 			vma->vm_end - size;
 	if (is_hugepage_only_range(vma->vm_mm, new_start, size))
 		return -EFAULT;
 
+	/* mlock limit tests */
+	if (vma->vm_flags & VM_LOCKED)
+		if (__account_locked_vm(mm, grow << PAGE_SHIFT, current,
+						capable(CAP_IPC_LOCK)))
+			return -ENOMEM;
+
 	/*
 	 * Overcommit..  This must be the final test, as it will
 	 * update security statistics.
 	 */
-	if (security_vm_enough_memory_mm(mm, grow))
+	if (security_vm_enough_memory_mm(mm, grow)) {
+		if (vma->vm_flags & VM_LOCKED)
+			__unaccount_locked_vm(mm, grow << PAGE_SHIFT);
 		return -ENOMEM;
+	}
 
 	return 0;
 }
@@ -1975,8 +1960,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 				 * to guard against concurrent vma expansions.
 				 */
 				spin_lock(&mm->page_table_lock);
-				if (vma->vm_flags & VM_LOCKED)
-					mm->locked_vm += grow;
 				vm_stat_account(mm, vma->vm_flags, grow);
 				anon_vma_interval_tree_pre_update_vma(vma);
 				vma->vm_end = address;
@@ -2056,8 +2039,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
 				 * to guard against concurrent vma expansions.
 				 */
 				spin_lock(&mm->page_table_lock);
-				if (vma->vm_flags & VM_LOCKED)
-					mm->locked_vm += grow;
 				vm_stat_account(mm, vma->vm_flags, grow);
 				anon_vma_interval_tree_pre_update_vma(vma);
 				vma->vm_start = address;
@@ -2281,7 +2262,7 @@ static inline int munmap_sidetree(struct vm_area_struct *vma,
 		return -ENOMEM;
 
 	if (vma->vm_flags & VM_LOCKED)
-		vma->vm_mm->locked_vm -= vma_pages(vma);
+		__unaccount_locked_vm(vma->vm_mm, vma_pages(vma));
 
 	return 0;
 }
@@ -2525,6 +2506,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	struct vm_area_struct *next, *prev, *merge;
 	pgoff_t pglen = len >> PAGE_SHIFT;
 	unsigned long charged = 0;
+	unsigned long locked = 0;
 	unsigned long end = addr + len;
 	unsigned long merge_start = addr, merge_end = end;
 	pgoff_t vm_pgoff;
@@ -2560,6 +2542,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 		vm_flags |= VM_ACCOUNT;
 	}
 
+	if (vm_flags & VM_LOCKED) {
+		locked = len >> PAGE_SHIFT;
+		if (__account_locked_vm(mm, locked, current,
+						capable(CAP_IPC_LOCK))) {
+			error = -ENOMEM;
+			goto unacct_error;
+		}
+	}
+
 	next = mas_next(&mas, ULONG_MAX);
 	prev = mas_prev(&mas, 0);
 	if (vm_flags & VM_SPECIAL)
@@ -2605,7 +2596,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	vma = vm_area_alloc(mm);
 	if (!vma) {
 		error = -ENOMEM;
-		goto unacct_error;
+		goto unlock_error;
 	}
 
 	vma->vm_start = addr;
@@ -2725,8 +2716,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 					is_vm_hugetlb_page(vma) ||
 					vma == get_gate_vma(current->mm))
 			vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
-		else
-			mm->locked_vm += (len >> PAGE_SHIFT);
 	}
 
 	if (file)
@@ -2759,6 +2748,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 		mapping_unmap_writable(file->f_mapping);
 free_vma:
 	vm_area_free(vma);
+unlock_error:
+	if (locked)
+		__unaccount_locked_vm(mm, locked);
 unacct_error:
 	if (charged)
 		vm_unacct_memory(charged);
@@ -2942,8 +2934,13 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
 	if (mm->map_count > sysctl_max_map_count)
 		return -ENOMEM;
 
+	if (flags & VM_LOCKED)
+		if (__account_locked_vm(mm, len >> PAGE_SHIFT, current,
+					capable(CAP_IPC_LOCK)))
+			return -ENOMEM;
+
 	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
-		return -ENOMEM;
+		goto unacct_locked;
 
 	/*
 	 * Expand the existing vma if possible; Note that singular lists do not
@@ -2993,8 +2990,6 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
 	mm->data_vm += len >> PAGE_SHIFT;
-	if (flags & VM_LOCKED)
-		mm->locked_vm += (len >> PAGE_SHIFT);
 	vma->vm_flags |= VM_SOFTDIRTY;
 	validate_mm(mm);
 	return 0;
@@ -3003,6 +2998,8 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
 	vm_area_free(vma);
 unacct_fail:
 	vm_unacct_memory(len >> PAGE_SHIFT);
+unacct_locked:
+	__unaccount_locked_vm(mm, len >> PAGE_SHIFT);
 	return -ENOMEM;
 }
 
@@ -3064,7 +3061,7 @@ void exit_mmap(struct mm_struct *mm)
 {
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
-	unsigned long nr_accounted = 0;
+	unsigned long nr_accounted = 0, nr_locked = 0;
 	MA_STATE(mas, &mm->mm_mt, 0, 0);
 	int count = 0;
 
@@ -3107,6 +3104,8 @@ void exit_mmap(struct mm_struct *mm)
 	do {
 		if (vma->vm_flags & VM_ACCOUNT)
 			nr_accounted += vma_pages(vma);
+		if (vma->vm_flags & VM_LOCKED)
+			nr_locked += vma_pages(vma);
 		remove_vma(vma);
 		count++;
 		cond_resched();
@@ -3116,6 +3115,7 @@ void exit_mmap(struct mm_struct *mm)
 
 	trace_exit_mmap(mm);
 	__mt_destroy(&mm->mm_mt);
+	__unaccount_locked_vm(mm, nr_locked);
 	mmap_write_unlock(mm);
 	vm_unacct_memory(nr_accounted);
 }
diff --git a/mm/mremap.c b/mm/mremap.c
index fe587c5..67cc4f3 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -574,7 +574,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		bool *locked, unsigned long flags,
 		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
 {
-	long to_account = new_len - old_len;
+	long to_account = (new_len - old_len) >> PAGE_SHIFT;
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma;
 	unsigned long vm_flags = vma->vm_flags;
@@ -594,7 +594,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		return -ENOMEM;
 
 	if (unlikely(flags & MREMAP_DONTUNMAP))
-		to_account = new_len;
+		to_account = new_len >> PAGE_SHIFT;
 
 	if (vma->vm_ops && vma->vm_ops->may_split) {
 		if (vma->vm_start != old_addr)
@@ -618,16 +618,36 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		return err;
 
 	if (vm_flags & VM_ACCOUNT) {
-		if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
+		if (security_vm_enough_memory_mm(mm, to_account))
 			return -ENOMEM;
 	}
 
+	/*
+	 * MREMAP_DONTUNMAP clears VM_LOCKED on the old vma and
+	 * implies new_len == old_len so no need to account locked
+	 * pages.
+	 */
+	if ((vm_flags & VM_LOCKED) && likely(!(flags & MREMAP_DONTUNMAP))) {
+		if (__account_locked_vm(mm, to_account, current,
+					capable(CAP_IPC_LOCK))) {
+			if (vm_flags & VM_ACCOUNT)
+				vm_unacct_memory(to_account);
+			return -ENOMEM;
+		}
+		*locked = true;
+	}
+
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
 			   &need_rmap_locks);
 	if (!new_vma) {
 		if (vm_flags & VM_ACCOUNT)
-			vm_unacct_memory(to_account >> PAGE_SHIFT);
+			vm_unacct_memory(to_account);
+		if ((vm_flags & VM_LOCKED) &&
+		    likely(!(flags & MREMAP_DONTUNMAP))) {
+			__unaccount_locked_vm(mm, to_account);
+			*locked = false;
+		}
 		return -ENOMEM;
 	}
 
@@ -696,10 +716,11 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 			vma->vm_end == (old_addr + old_len))
 			unlink_anon_vmas(vma);
 
-		/* Because we won't unmap we don't need to touch locked_vm */
 		return new_addr;
 	}
 
+	/* Make sure do_munmap() doesn't unaccount locked pages */
+	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
 	if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
 		if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
@@ -707,15 +728,11 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		excess = 0;
 	}
 
-	if (vm_flags & VM_LOCKED) {
-		mm->locked_vm += new_len >> PAGE_SHIFT;
-		*locked = true;
-	}
-
 	mm->hiwater_vm = hiwater_vm;
 
 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
 	if (excess) {
+		vma->vm_flags = vm_flags;
 		vma->vm_flags |= VM_ACCOUNT;
 		if (split)
 			find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT;
@@ -768,9 +785,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
 		return ERR_PTR(-EFAULT);
 
-	if (mlock_future_check(mm, vma->vm_flags, new_len - old_len))
-		return ERR_PTR(-EAGAIN);
-
 	if (!may_expand_vm(mm, vma->vm_flags,
 				(new_len - old_len) >> PAGE_SHIFT))
 		return ERR_PTR(-ENOMEM);
@@ -1026,6 +1040,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 				}
 			}
 
+			if (vma->vm_flags & VM_LOCKED) {
+				if (__account_locked_vm(mm, pages, current,
+						capable(CAP_IPC_LOCK))) {
+					if (vma->vm_flags & VM_ACCOUNT)
+						vm_unacct_memory(pages);
+					ret = -ENOMEM;
+					goto out;
+				}
+			}
+
 			/*
 			 * Function vma_merge() is called on the extension we are adding to
 			 * the already existing vma, vma_merge() will merge this extension with
@@ -1038,14 +1062,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 					extension_pgoff, vma_policy(vma),
 					vma->vm_userfaultfd_ctx, anon_vma_name(vma));
 			if (!vma) {
+				// TODO: We always unacct memory
+				// regardless of VM_ACCOUNT flags?
 				vm_unacct_memory(pages);
+				__unaccount_locked_vm(mm, pages);
 				ret = -ENOMEM;
 				goto out;
 			}
 
 			vm_stat_account(mm, vma->vm_flags, pages);
 			if (vma->vm_flags & VM_LOCKED) {
-				mm->locked_vm += pages;
 				locked = true;
 				new_addr = addr;
 			}
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 04c3ac9..4515eb4 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -120,13 +120,11 @@ static int secretmem_release(struct inode *inode, struct file *file)
 
 static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	unsigned long len = vma->vm_end - vma->vm_start;
-
 	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
 		return -EINVAL;
 
-	if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
-		return -EAGAIN;
+	if (account_locked_vm(vma->vm_mm, vma->vm_end - vma->vm_start))
+		return -ENOMEM;
 
 	vma->vm_flags |= VM_LOCKED | VM_DONTDUMP;
 	vma->vm_ops = &secretmem_vm_ops;
-- 
git-series 0.9.1