From: Johannes Weiner <hannes@xxxxxxxxxxx> [ Upstream commit 759496ba6407c6994d6a5ce3a5e74937d7816208 ] Unlike global OOM handling, memory cgroup code will invoke the OOM killer in any OOM situation because it has no way of telling faults occuring in kernel context - which could be handled more gracefully - from user-triggered faults. Pass a flag that identifies faults originating in user space from the architecture-specific fault handlers to generic code so that memcg OOM handling can be improved. Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx> Reviewed-by: Michal Hocko <mhocko@xxxxxxx> Cc: David Rientjes <rientjes@xxxxxxxxxx> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> Cc: azurIt <azurit@xxxxxxxx> Cc: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Signed-off-by: Cong Wang <xiyou.wangcong@xxxxxxxxx> Conflicts: arch/arc/mm/fault.c arch/powerpc/mm/fault.c --- arch/alpha/mm/fault.c | 7 ++++--- arch/arc/mm/fault.c | 6 ++++-- arch/arm/mm/fault.c | 9 ++++++--- arch/arm64/mm/fault.c | 17 ++++++++++------- arch/avr32/mm/fault.c | 2 ++ arch/cris/mm/fault.c | 6 ++++-- arch/frv/mm/fault.c | 10 ++++++---- arch/hexagon/mm/vm_fault.c | 6 ++++-- arch/ia64/mm/fault.c | 6 ++++-- arch/m32r/mm/fault.c | 10 ++++++---- arch/m68k/mm/fault.c | 2 ++ arch/metag/mm/fault.c | 6 ++++-- arch/microblaze/mm/fault.c | 7 +++++-- arch/mips/mm/fault.c | 6 ++++-- arch/mn10300/mm/fault.c | 2 ++ arch/openrisc/mm/fault.c | 1 + arch/parisc/mm/fault.c | 7 +++++-- arch/powerpc/mm/fault.c | 7 ++++--- arch/s390/mm/fault.c | 2 ++ arch/score/mm/fault.c | 7 ++++++- arch/sh/mm/fault.c | 9 ++++++--- arch/sparc/mm/fault_32.c | 12 +++++++++--- arch/sparc/mm/fault_64.c | 6 ++++-- arch/tile/mm/fault.c | 7 +++++-- arch/um/kernel/trap.c | 20 ++++++++++++-------- arch/unicore32/mm/fault.c | 8 ++++++-- arch/x86/mm/fault.c | 8 +++++--- arch/xtensa/mm/fault.c | 2 ++ include/linux/mm.h | 1 + 29 files changed, 135 insertions(+), 64 deletions(-) diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 0c4132d..98838a0 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -89,8 +89,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, const struct exception_table_entry *fixup; int fault, si_code = SEGV_MAPERR; siginfo_t info; - unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (cause > 0 ? FAULT_FLAG_WRITE : 0)); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* As of EV6, a load into $31/$f31 is a prefetch, and never faults (or is suppressed by the PALcode). Support that for older CPUs @@ -115,7 +114,8 @@ do_page_fault(unsigned long address, unsigned long mmcsr, if (address >= TASK_SIZE) goto vmalloc_fault; #endif - + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -142,6 +142,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, } else { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } /* If for any reason at all we couldn't handle the fault, diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 87f4281..50533b7 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -59,8 +59,7 @@ void do_page_fault(struct pt_regs *regs, int write, unsigned long address, struct mm_struct *mm = tsk->mm; siginfo_t info; int fault, ret; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * We fault-in kernel-space virtual memory on-demand. The @@ -88,6 +87,8 @@ void do_page_fault(struct pt_regs *regs, int write, unsigned long address, if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -115,6 +116,7 @@ void do_page_fault(struct pt_regs *regs, int write, unsigned long address, if (write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 2ebf4f6..160da6d 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -261,9 +261,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) struct task_struct *tsk; struct mm_struct *mm; int fault, sig, code; - int write = fsr & FSR_WRITE; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; if (notify_page_fault(regs, fsr)) return 0; @@ -282,6 +280,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (fsr & FSR_WRITE) + flags |= FAULT_FLAG_WRITE; + /* * As per x86, we may deadlock here. However, since the kernel only * validly references user space from well defined areas of the code, diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index f5f0c70..b5d4587 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -199,13 +199,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC; unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - if (esr & ESR_LNX_EXEC) { - vm_flags = VM_EXEC; - } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { - vm_flags = VM_WRITE; - mm_flags |= FAULT_FLAG_WRITE; - } - tsk = current; mm = tsk->mm; @@ -220,6 +213,16 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + mm_flags |= FAULT_FLAG_USER; + + if (esr & ESR_LNX_EXEC) { + vm_flags = VM_EXEC; + } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { + vm_flags = VM_WRITE; + mm_flags |= FAULT_FLAG_WRITE; + } + /* * As per x86, we may deadlock here. However, since the kernel only * validly references user space from well defined areas of the code, diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index 2ca27b0..0eca933 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -86,6 +86,8 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs) local_irq_enable(); + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 73312ab..1790f22 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -58,8 +58,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, struct vm_area_struct * vma; siginfo_t info; int fault; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - ((writeaccess & 1) ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; D(printk(KERN_DEBUG "Page fault for %lX on %X at %lX, prot %d write %d\n", @@ -117,6 +116,8 @@ do_page_fault(unsigned long address, struct pt_regs *regs, if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -155,6 +156,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, } else if (writeaccess == 1) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 331c1e2..9a66372 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -34,11 +34,11 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear struct vm_area_struct *vma; struct mm_struct *mm; unsigned long _pme, lrai, lrad, fixup; + unsigned long flags = 0; siginfo_t info; pgd_t *pge; pud_t *pue; pte_t *pte; - int write; int fault; #if 0 @@ -81,6 +81,9 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear if (in_atomic() || !mm) goto no_context; + if (user_mode(__frame)) + flags |= FAULT_FLAG_USER; + down_read(&mm->mmap_sem); vma = find_vma(mm, ear0); @@ -129,7 +132,6 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear */ good_area: info.si_code = SEGV_ACCERR; - write = 0; switch (esr0 & ESR0_ATXC) { default: /* handle write to write protected page */ @@ -140,7 +142,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear #endif if (!(vma->vm_flags & VM_WRITE)) goto bad_area; - write = 1; + flags |= FAULT_FLAG_WRITE; break; /* handle read from protected page */ @@ -162,7 +164,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(mm, vma, ear0, flags); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 1bd276d..8704c93 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -53,8 +53,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) int si_code = SEGV_MAPERR; int fault; const struct exception_table_entry *fixup; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (cause > 0 ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * If we're in an interrupt or have no user context, @@ -65,6 +64,8 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) local_irq_enable(); + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -96,6 +97,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) case FLT_STORE: if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; break; } diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 6cf0341..7225dad 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -90,8 +90,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT) | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)); - flags |= ((mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0); - /* mmap_sem is performance critical.... */ prefetchw(&mm->mmap_sem); @@ -119,6 +117,10 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re if (notify_page_fault(regs, TRAP_BRKPT)) return; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (mask & VM_WRITE) + flags |= FAULT_FLAG_WRITE; retry: down_read(&mm->mmap_sem); diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 3cdfa9c..e9c6a80 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -78,7 +78,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm; struct vm_area_struct * vma; unsigned long page, addr; - int write; + unsigned long flags = 0; int fault; siginfo_t info; @@ -117,6 +117,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, if (in_atomic() || !mm) goto bad_area_nosemaphore; + if (error_code & ACE_USERMODE) + flags |= FAULT_FLAG_USER; + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an @@ -166,14 +169,13 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, */ good_area: info.si_code = SEGV_ACCERR; - write = 0; switch (error_code & (ACE_WRITE|ACE_PROTECTION)) { default: /* 3: write, present */ /* fall through */ case ACE_WRITE: /* write, not present */ if (!(vma->vm_flags & VM_WRITE)) goto bad_area; - write++; + flags |= FAULT_FLAG_WRITE; break; case ACE_PROTECTION: /* read, present */ case 0: /* read, not present */ @@ -194,7 +196,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, */ addr = (address & PAGE_MASK); set_thread_fault_code(error_code); - fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(mm, vma, addr, flags); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index a563727..eb1d61f 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -88,6 +88,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c index 8fddf46..332680e 100644 --- a/arch/metag/mm/fault.c +++ b/arch/metag/mm/fault.c @@ -53,8 +53,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, struct vm_area_struct *vma, *prev_vma; siginfo_t info; int fault; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write_access ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; @@ -109,6 +108,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); @@ -121,6 +122,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, if (write_access) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 731f739..fa4cf52 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -92,8 +92,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, int code = SEGV_MAPERR; int is_write = error_code & ESR_S; int fault; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (is_write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; regs->ear = address; regs->esr = error_code; @@ -121,6 +120,9 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an @@ -199,6 +201,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, if (unlikely(is_write)) { if (unlikely(!(vma->vm_flags & VM_WRITE))) goto bad_area; + flags |= FAULT_FLAG_WRITE; /* a read */ } else { /* protection fault */ diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index da47e9b..0214a43 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -41,8 +41,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long writ const int field = sizeof(unsigned long) * 2; siginfo_t info; int fault; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; #if 0 printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(), @@ -92,6 +91,8 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long writ if (in_atomic() || !mm) goto bad_area_nosemaphore; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -113,6 +114,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long writ if (write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (cpu_has_rixi) { if (address == regs->cp0_epc && !(vma->vm_flags & VM_EXEC)) { diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index 8a2e6de..3516cbd 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -171,6 +171,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code, if (in_atomic() || !mm) goto no_context; + if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 4a41f84..0703acf 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -86,6 +86,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, if (user_mode(regs)) { /* Exception was in userspace: reenable interrupts */ local_irq_enable(); + flags |= FAULT_FLAG_USER; } else { /* If exception was in a syscall, then IRQ's may have * been enabled or disabled. If they were enabled, diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index f247a34..d10d27a 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -180,6 +180,10 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (acc_type & VM_WRITE) + flags |= FAULT_FLAG_WRITE; retry: down_read(&mm->mmap_sem); vma = find_vma_prev(mm, address, &prev_vma); @@ -203,8 +207,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, * fault. */ - fault = handle_mm_fault(mm, vma, address, - flags | ((acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0)); + fault = handle_mm_fault(mm, vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 8726779..d9196c9 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -223,9 +223,6 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, is_write = error_code & ESR_DST; #endif /* CONFIG_4xx || CONFIG_BOOKE */ - if (is_write) - flags |= FAULT_FLAG_WRITE; - #ifdef CONFIG_PPC_ICSWX /* * we need to do this early because this "data storage @@ -280,6 +277,9 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an @@ -408,6 +408,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, } else if (is_write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; /* a read */ } else { /* protection fault */ diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 047c3e4..416face 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -302,6 +302,8 @@ static inline int do_exception(struct pt_regs *regs, int access) address = trans_exc_code & __FAIL_ADDR_MASK; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) flags |= FAULT_FLAG_WRITE; down_read(&mm->mmap_sem); diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 4b71a62..52238983 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -47,6 +47,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; const int field = sizeof(unsigned long) * 2; + unsigned long flags = 0; siginfo_t info; int fault; @@ -75,6 +76,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, if (in_atomic() || !mm) goto bad_area_nosemaphore; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) @@ -95,6 +99,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, if (write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) goto bad_area; @@ -105,7 +110,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, flags); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 1f49c28..541dc61 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -400,9 +400,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, struct mm_struct *mm; struct vm_area_struct * vma; int fault; - int write = error_code & FAULT_CODE_WRITE; - unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0)); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; mm = tsk->mm; @@ -476,6 +474,11 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, set_thread_fault_code(error_code); + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (error_code & FAULT_CODE_WRITE) + flags |= FAULT_FLAG_WRITE; + /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index e98bfda..59dbd46 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -177,8 +177,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, unsigned long g2; int from_user = !(regs->psr & PSR_PS); int fault, code; - unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0)); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; if (text_fault) address = regs->pc; @@ -235,6 +234,11 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, goto bad_area; } + if (from_user) + flags |= FAULT_FLAG_USER; + if (write) + flags |= FAULT_FLAG_WRITE; + /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -383,6 +387,7 @@ static void force_user_fault(unsigned long address, int write) struct vm_area_struct *vma; struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; + unsigned int flags = FAULT_FLAG_USER; int code; code = SEGV_MAPERR; @@ -402,11 +407,12 @@ static void force_user_fault(unsigned long address, int write) if (write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } - switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) { + switch (handle_mm_fault(mm, vma, address, flags)) { case VM_FAULT_SIGBUS: case VM_FAULT_OOM: goto do_sigbus; diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index ea83f82..3841a08 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -323,7 +323,8 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) bad_kernel_pc(regs, address); return; } - } + } else + flags |= FAULT_FLAG_USER; /* * If we're in an interrupt or have no user @@ -426,13 +427,14 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) vma->vm_file != NULL) set_thread_fault_code(fault_code | FAULT_CODE_BLKCOMMIT); + + flags |= FAULT_FLAG_WRITE; } else { /* Allow reads even for write-only mappings */ if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } - flags |= ((fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0); fault = handle_mm_fault(mm, vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index ac553ee..3ff289f 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -280,8 +280,7 @@ static int handle_page_fault(struct pt_regs *regs, if (!is_page_fault) write = 1; - flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0)); + flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL); @@ -365,6 +364,9 @@ static int handle_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; } + if (!is_kernel_mode) + flags |= FAULT_FLAG_USER; + /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -425,6 +427,7 @@ static int handle_page_fault(struct pt_regs *regs, #endif if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (!is_page_fault || !(vma->vm_flags & VM_READ)) goto bad_area; diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index b2f5adf..5c3aef7 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -30,8 +30,7 @@ int handle_page_fault(unsigned long address, unsigned long ip, pmd_t *pmd; pte_t *pte; int err = -EFAULT; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (is_write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; *code_out = SEGV_MAPERR; @@ -42,6 +41,8 @@ int handle_page_fault(unsigned long address, unsigned long ip, if (in_atomic()) goto out_nosemaphore; + if (is_user) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -58,12 +59,15 @@ int handle_page_fault(unsigned long address, unsigned long ip, good_area: *code_out = SEGV_ACCERR; - if (is_write && !(vma->vm_flags & VM_WRITE)) - goto out; - - /* Don't require VM_READ|VM_EXEC for write faults! */ - if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC))) - goto out; + if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) + goto out; + flags |= FAULT_FLAG_WRITE; + } else { + /* Don't require VM_READ|VM_EXEC for write faults! */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto out; + } do { int fault; diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c index 8ed3c45..0dc922d 100644 --- a/arch/unicore32/mm/fault.c +++ b/arch/unicore32/mm/fault.c @@ -209,8 +209,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs) struct task_struct *tsk; struct mm_struct *mm; int fault, sig, code; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - ((!(fsr ^ 0x12)) ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; mm = tsk->mm; @@ -222,6 +221,11 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs) if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (!(fsr ^ 0x12)) + flags |= FAULT_FLAG_WRITE; + /* * As per x86, we may deadlock here. However, since the kernel only * validly references user space from well defined areas of the code, diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c1e9e4c..78dee3e 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1017,9 +1017,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) unsigned long address; struct mm_struct *mm; int fault; - int write = error_code & PF_WRITE; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; mm = tsk->mm; @@ -1089,6 +1087,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; + flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); @@ -1113,6 +1112,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) return; } + if (error_code & PF_WRITE) + flags |= FAULT_FLAG_WRITE; + /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 4b7bc8d..70fa7bc 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -72,6 +72,8 @@ void do_page_fault(struct pt_regs *regs) address, exccause, regs->pc, is_write? "w":"", is_exec? "x":""); #endif + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); diff --git a/include/linux/mm.h b/include/linux/mm.h index a9a4830..044b011 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -167,6 +167,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ #define FAULT_FLAG_TRIED 0x40 /* second try */ +#define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe stable" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html