Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
---
v3:
- Rebase to akpm/mm-unstable
- Copy arch maintainers
---
arch/alpha/mm/fault.c | 4 ++++
arch/arc/mm/fault.c | 4 ++++
arch/arm/mm/fault.c | 4 ++++
arch/arm64/mm/fault.c | 4 ++++
arch/csky/mm/fault.c | 4 ++++
arch/hexagon/mm/vm_fault.c | 4 ++++
arch/ia64/mm/fault.c | 4 ++++
arch/m68k/mm/fault.c | 4 ++++
arch/microblaze/mm/fault.c | 4 ++++
arch/mips/mm/fault.c | 4 ++++
arch/nios2/mm/fault.c | 4 ++++
arch/openrisc/mm/fault.c | 4 ++++
arch/parisc/mm/fault.c | 4 ++++
arch/powerpc/mm/copro_fault.c | 5 +++++
arch/powerpc/mm/fault.c | 5 +++++
arch/riscv/mm/fault.c | 4 ++++
arch/s390/mm/fault.c | 12 +++++++++++-
arch/sh/mm/fault.c | 4 ++++
arch/sparc/mm/fault_32.c | 4 ++++
arch/sparc/mm/fault_64.c | 5 +++++
arch/um/kernel/trap.c | 4 ++++
arch/x86/mm/fault.c | 4 ++++
arch/xtensa/mm/fault.c | 4 ++++
include/linux/mm_types.h | 2 ++
mm/gup.c | 34 +++++++++++++++++++++++++++++++++-
mm/memory.c | 2 +-
26 files changed, 138 insertions(+), 3 deletions(-)
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index ec20c1004abf..ef427a6bdd1a 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -155,6 +155,10 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index dad27e4d69ff..5ca59a482632 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -146,6 +146,10 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
return;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
/*
* Fault retry nuances, mmap_lock already relinquished by core mm
*/
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index a062e07516dd..46cccd6bf705 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -322,6 +322,10 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
return 0;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return 0;
+
if (!(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 77341b160aca..e401d416bbd6 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -607,6 +607,10 @@ static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
return 0;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return 0;
+
if (fault & VM_FAULT_RETRY) {
mm_flags |= FAULT_FLAG_TRIED;
goto retry;
diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c
index 7215a46b6b8e..e15f736cca4b 100644
--- a/arch/csky/mm/fault.c
+++ b/arch/csky/mm/fault.c
@@ -285,6 +285,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs)
return;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) {
flags |= FAULT_FLAG_TRIED;
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
index 4fac4b9eb316..f73c7cbfe326 100644
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -96,6 +96,10 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
/* The most common case -- we are done. */
if (likely(!(fault & VM_FAULT_ERROR))) {
if (fault & VM_FAULT_RETRY) {
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 07379d1a227f..ef78c2d66cdd 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -139,6 +139,10 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
/*
* We ran out of memory, or some other thing happened
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index 71aa9f6315dc..4d2837eb3e2a 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -141,6 +141,10 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
if (fault_signal_pending(fault, regs))
return 0;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return 0;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index a9626e6a68af..5c40c3ebe52f 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -222,6 +222,10 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 44f98100e84e..6f72bac39bf2 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -162,6 +162,10 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
return;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c
index a32f14cd72f2..edaca0a6c1c1 100644
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -139,6 +139,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index 80bb66ad42f6..c18f7abd64df 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -167,6 +167,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index f114e102aaf2..fe57175a7792 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -309,6 +309,10 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
/*
* We hit a shared mapping outside of the file, or some
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index c1cb21a00884..7c507fb48182 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -65,6 +65,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
ret = 0;
*flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL);
+
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (*flt & VM_FAULT_COMPLETED)
+ return 0;
+
if (unlikely(*flt & VM_FAULT_ERROR)) {
if (*flt & VM_FAULT_OOM) {
ret = -ENOMEM;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index d53fed4eccbd..014005428687 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -511,6 +511,10 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
if (fault_signal_pending(fault, regs))
return user_mode(regs) ? 0 : SIGBUS;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ goto out;
+
/*
* Handle the retry right now, the mmap_lock has been released in that
* case.
@@ -525,6 +529,7 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
if (unlikely(fault & VM_FAULT_ERROR))
return mm_fault_error(regs, address, fault);
+out:
/*
* Major/minor page fault accounting.
*/
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 4e9efbe46d5f..d6a87f4137ca 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -330,6 +330,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs)
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_RETRY)) {
flags |= FAULT_FLAG_TRIED;
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index e173b6187ad5..9503a7cfaf03 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -339,6 +339,7 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
unsigned long address;
unsigned int flags;
vm_fault_t fault;
+ bool need_unlock = true;
bool is_write;
tsk = current;
@@ -433,6 +434,13 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
goto out_up;
goto out;
}
+
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED) {
+ need_unlock = false;
+ goto out_gmap;
+ }
+
if (unlikely(fault & VM_FAULT_ERROR))
goto out_up;
@@ -452,6 +460,7 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
mmap_read_lock(mm);
goto retry;
}
+out_gmap:
if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
address = __gmap_link(gmap, current->thread.gmap_addr,
address);
@@ -466,7 +475,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
}
fault = 0;
out_up:
- mmap_read_unlock(mm);
+ if (need_unlock)
+ mmap_read_unlock(mm);
out:
return fault;
}
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index e175667b1363..acd2f5e50bfc 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -485,6 +485,10 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
if (mm_fault_error(regs, error_code, address, fault))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index ad569d9bd124..91259f291c54 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -190,6 +190,10 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 253e07043298..4acc12eafbf5 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -427,6 +427,10 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
if (fault_signal_pending(fault, regs))
goto exit_exception;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ goto lock_released;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
@@ -449,6 +453,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
}
mmap_read_unlock(mm);
+lock_released:
mm_rss = get_mm_rss(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
mm_rss -= (mm->context.thp_pte_count * (HPAGE_SIZE / PAGE_SIZE));
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index d1d5d0be0308..d3ce21c4ca32 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -76,6 +76,10 @@ int handle_page_fault(unsigned long address, unsigned long ip,
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
goto out_nosemaphore;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return 0;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM) {
goto out_of_memory;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index d0074c6ed31a..12ed70b432d6 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1408,6 +1408,10 @@ void do_user_addr_fault(struct pt_regs *regs,
return;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
/*
* If we need to retry the mmap_lock has already been released,
* and if there is a fatal signal pending there is no guarantee
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 06d0973a0d74..5f64305ba8d7 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -118,6 +118,10 @@ void do_page_fault(struct pt_regs *regs)
return;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c09b7f0555b8..decc275db3c9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -729,6 +729,7 @@ typedef __bitwise unsigned int vm_fault_t;
* @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs
* fsync() to complete (for synchronous page faults
* in DAX)
+ * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released
* @VM_FAULT_HINDEX_MASK: mask HINDEX value
*
*/
@@ -746,6 +747,7 @@ enum vm_fault_reason {
VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800,
VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
+ VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000,
VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
};
diff --git a/mm/gup.c b/mm/gup.c
index 2e07cff3b31b..3347b083d70b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -951,6 +951,25 @@ static int faultin_page(struct vm_area_struct *vma,
}
ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+ if (ret & VM_FAULT_COMPLETED) {
+ /*
+ * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
+ * mmap lock in the page fault handler. Sanity check this.
+ */
+ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
+ if (locked)
+ *locked = 0;
+ /*
+ * We should do the same as VM_FAULT_RETRY, but let's not
+ * return -EBUSY since that's not reflecting the reality on
+ * what has happened - we've just fully completed a page
+ * fault, with the mmap lock released. Use -EAGAIN to show
+ * that we want to take the mmap lock _again_.
+ */
+ return -EAGAIN;
+ }
+
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, *flags);
@@ -1177,6 +1196,7 @@ static long __get_user_pages(struct mm_struct *mm,
case 0:
goto retry;
case -EBUSY:
+ case -EAGAIN:
ret = 0;
fallthrough;
case -EFAULT:
@@ -1303,6 +1323,18 @@ int fixup_user_fault(struct mm_struct *mm,
return -EINTR;
ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+ if (ret & VM_FAULT_COMPLETED) {
+ /*
+ * NOTE: it's a pity that we need to retake the lock here
+ * to pair with the unlock() in the callers. Ideally we
+ * could tell the callers so they do not need to unlock.
+ */
+ mmap_read_lock(mm);
+ *unlocked = true;
+ return 0;
+ }
+
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, 0);
@@ -1368,7 +1400,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
/* VM_FAULT_RETRY couldn't trigger, bypass */
return ret;
- /* VM_FAULT_RETRY cannot return errors */
+ /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
if (!*locked) {
BUG_ON(ret < 0);
BUG_ON(ret >= nr_pages);
diff --git a/mm/memory.c b/mm/memory.c
index 54d106e0c999..a8be2d7a8718 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3020,7 +3020,7 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
balance_dirty_pages_ratelimited(mapping);
if (fpin) {
fput(fpin);
- return VM_FAULT_RETRY;
+ return VM_FAULT_COMPLETED;
}
}