On Thu, Feb 02, 2023 at 05:56:22PM -0500, Peter Xu wrote:
IMHO it'll be nearly impossible to merge things across most (if not all) archs. It will need to start from one, or at least a few that still share a major common base - I would still rely on x86 as a start - and then we try to use the helper in as many archs as possible. Even on x86 I see challenges, so I'm not sure whether a common enough routine can indeed be abstracted. But I believe there's a way to do this, because obviously we still see tons of duplicated logic lying around. It will definitely need time to work out where the best spot to start is, and how to gradually move towards covering more archs starting from one.
FWIW, after going through everything from alpha to loongarch (in alphabetic order, skipping the itanic) the following seems to be suitable for all of them: generic_fault(address, flags, vm_flags, regs) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; vm_fault_t fault; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); if (unlikely(!mmap_read_trylock(mm))) { if (!(flags & FAULT_FLAG_USER) && !search_exception_tables(instruction_pointer(regs))) { /* * Fault from code in kernel from * which we do not expect faults. */ return KERN; } retry: mmap_read_lock(mm); } else { might_sleep(); #ifdef CONFIG_DEBUG_VM if (!(flags & FAULT_FLAG_USER) && !search_exception_tables(instruction_pointer(regs))) return KERN; #endif } vma = find_vma(mm, address); if (!vma) goto Eunmapped; if (unlikely(vma->vm_start > address)) { if (!(vma->vm_flags & VM_GROWSDOWN)) goto Eunmapped; if (addr < FIRST_USER_ADDRESS) goto Eunmapped; if (expand_stack(vma, address)) goto Eunmapped; } /* Ok, we have a good vm_area for this memory access, so we can handle it. */ if (!(vma->vm_flags & vm_flags)) goto Eaccess; /* If for any reason at all we couldn't handle the fault, make sure we exit gracefully rather than endlessly redo the fault. 
*/ fault = handle_mm_fault(vma, address, flags, regs); if (unlikely(fault & VM_FAULT_RETRY)) { if (!(flags & FAULT_FLAG_USER)) { if (fatal_signal_pending(current)) return KERN; } else { if (signal_pending(current)) return FOAD; } flags |= FAULT_FLAG_TRIED; goto retry; } if (fault & VM_FAULT_COMPLETED) return DONE; mmap_read_unlock(mm); if (likely(!(fault & VM_FAULT_ERROR))) return DONE; if (!(flags & FAULT_FLAG_USER)) return KERN; if (fault & VM_FAULT_OOM) { pagefault_out_of_memory(); return FOAD; } if (fault & VM_FAULT_SIGSEGV) return SIGSEGV; if (fault & VM_FAULT_SIGBUS) return SIGBUS; if (fault & VM_FAULT_HWPOISON) return POISON + PAGE_SHIFT; // POISON == 256 if (fault & VM_FAULT_HWPOISON_LARGE) return POISON + hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); BUG(); Eunmapped: mmap_read_unlock(mm); return flags & FAULT_FLAG_USER ? MAPERR : KERN; Eaccess: mmap_read_unlock(mm); return flags & FAULT_FLAG_USER ? ACCERR : KERN; } possible return values (and that's obviously not the identifiers to be used for real; for now I'm just looking for feasibility of it all): DONE success, nothing else to be done FOAD OOM/fatal signal with VM_FAULT_RETRY/ signal with VM_FAULT_RETRY from userland - nothing to be done here. KERN kernel mode failed fault, fixup or oops MAPERR unmapped address, SIGSEGV/SEGV_MAPERR for you ACCERR nothing in vm_flags present in ->vm_flags of vma; SIGSEGV/SEGV_ACCERR SIGSEGV VM_FAULT_SIGSEGV; some architectures treat that as SEGV_MAPERR, some as SEGV_ACCERR. SIGBUS VM_FAULT_SIGBUS; SIGBUS/BUS_ADRERR POISON + shift VM_FAULT_HWPOISON and VM_FAULT_HWPOISON_LARGE, with log2(affected page size) encoded into return value. This is obviously not even close to final helper, but... alpha, arc, arm, arm64, csky, hexagon, loongarch convert to that cleanly. Itanic very much does not (due to weird dual stacks, awful address space layout, etc.), but then git rm arch/ia64 is long overdue. 
Fairly typical look after conversion:

arc:
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	unsigned int mask;
	unsigned int flags;
	unsigned int res;

	/*
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 */
	if (address >= VMALLOC_START && !user_mode(regs)) {
		if (unlikely(handle_kernel_vaddr_fault(address)))
			goto no_context;
		else
			return;
	}

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (faulthandler_disabled() || !mm)
		goto no_context;

	flags = FAULT_FLAG_DEFAULT;
	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;

	/*
	 * Work out which VM_* permission the access needs: read by
	 * default, write for stores, exec for instruction fetches.
	 */
	mask = VM_READ;
	if (regs->ecr_cause & ECR_C_PROTV_STORE) {	/* ST/EX */
		flags |= FAULT_FLAG_WRITE;
		mask = VM_WRITE;
	} else if ((regs->ecr_vec == ECR_V_PROTV) &&
		   (regs->ecr_cause == ECR_C_PROTV_INST_FETCH)) {
		mask = VM_EXEC;
	}

	/* everything else is delegated to generic_fault(); only its result is decoded here */
	res = generic_fault(address, flags, mask, regs);
	if (likely(res == DONE))
		return;
	if (res == FOAD)
		return;
	if (res == KERN) {
		/* also reached by the early bail-outs above via goto */
no_context:
		if (fixup_exception(regs))
			return;
		die("Oops", regs, address);
	}

	/*
	 * Signal delivery: SIGBUS for a SIGBUS result; every other result
	 * becomes SIGSEGV, with SEGV_ACCERR only for ACCERR and
	 * SEGV_MAPERR otherwise.
	 */
	tsk->thread.fault_address = address;
	if (res == SIGBUS)
		force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) address);
	else
		force_sig_fault(SIGSEGV, res == ACCERR ? SEGV_ACCERR : SEGV_MAPERR, (void __user *) address);
}

Or this arm64:
{
	const struct fault_info *inf;
	struct mm_struct *mm = current->mm;
	unsigned long vm_flags;
	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
	unsigned long addr = untagged_addr(far);
	unsigned int res;

	if (kprobe_page_fault(regs, esr))
		return 0;

	/*
	 * If we're in an interrupt or have no user context, we must not take
	 * the fault.
	 */
	if (faulthandler_disabled() || !mm)
		goto no_context;

	if (user_mode(regs))
		mm_flags |= FAULT_FLAG_USER;

	/*
	 * vm_flags tells us what bits we must have in vma->vm_flags
	 * for the fault to be benign, __do_page_fault() would check
	 * vma->vm_flags & vm_flags and returns an error if the
	 * intersection is empty
	 */
	if (is_el0_instruction_abort(esr)) {
		/* It was exec fault */
		vm_flags = VM_EXEC;
		mm_flags |= FAULT_FLAG_INSTRUCTION;
	} else if (is_write_abort(esr)) {
		/* It was write fault */
		vm_flags = VM_WRITE;
		mm_flags |= FAULT_FLAG_WRITE;
	} else {
		/* It was read fault */
		vm_flags = VM_READ;
		/* Write implies read */
		vm_flags |= VM_WRITE;
		/* If EPAN is absent then exec implies read */
		if (!cpus_have_const_cap(ARM64_HAS_EPAN))
			vm_flags |= VM_EXEC;
	}

	/*
	 * Kernel-mode permission fault on a TTBR0 address: die unless the
	 * faulting pc has an exception table entry.
	 */
	if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
		if (is_el1_instruction_abort(esr))
			die_kernel_fault("execution of user memory", addr, esr, regs);
		if (!search_exception_tables(regs->pc))
			die_kernel_fault("access to user memory outside uaccess routines", addr, esr, regs);
	}

	/* everything else is delegated to generic_fault(); only its result is decoded here */
	res = generic_fault(addr, mm_flags, vm_flags, regs);
	if (likely(res == DONE))
		return 0;
	if (res == FOAD)
		return 0;
	if (res == KERN) {
		/* also reached by the early bail-out above via goto */
no_context:
		__do_kernel_fault(addr, esr, regs);
		return 0;
	}

	inf = esr_to_fault_info(esr);
	set_thread_esr(addr, esr);
	if (res == SIGBUS) {
		/*
		 * We had some memory, but were unable to successfully fix up
		 * this page fault.
		 */
		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
	} else if (res > POISON) {
		/* res - POISON is the shift generic_fault() added to POISON */
		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, res - POISON, inf->name);
	} else {
		/*
		 * Something tried to access memory that isn't in our memory
		 * map.
		 */
		arm64_force_sig_fault(SIGSEGV, res == ACCERR ? SEGV_ACCERR : SEGV_MAPERR, far, inf->name);
	}
	return 0;
}