* Michel Lespinasse <michel@xxxxxxxxxxxxxx> [220128 08:10]:
> Attempt speculative mm fault handling first, and fall back to the
> existing (non-speculative) code if that fails.
>
> The speculative handling closely mirrors the non-speculative logic.
> This includes some x86 specific bits such as the access_error() call.
> This is why we chose to implement the speculative handling in arch/x86
> rather than in common code.
>
> The vma is first looked up and copied, under protection of the rcu
> read lock. The mmap lock sequence count is used to verify the
> integrity of the copied vma, and passed to do_handle_mm_fault() to
> allow checking against races with mmap writers when finalizing the fault.
>
> Signed-off-by: Michel Lespinasse <michel@xxxxxxxxxxxxxx>
> ---
>  arch/x86/mm/fault.c           | 44 +++++++++++++++++++++++++++++++++++
>  include/linux/mm_types.h      |  5 ++++
>  include/linux/vm_event_item.h |  4 ++++
>  mm/vmstat.c                   |  4 ++++
>  4 files changed, 57 insertions(+)
>
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index d0074c6ed31a..99b0a358154e 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -1226,6 +1226,10 @@ void do_user_addr_fault(struct pt_regs *regs,
>  	struct mm_struct *mm;
>  	vm_fault_t fault;
>  	unsigned int flags = FAULT_FLAG_DEFAULT;
> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> +	struct vm_area_struct pvma;
> +	unsigned long seq;
> +#endif
>
>  	tsk = current;
>  	mm = tsk->mm;
> @@ -1323,6 +1327,43 @@ void do_user_addr_fault(struct pt_regs *regs,
>  	}
>  #endif
>
> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> +	count_vm_event(SPF_ATTEMPT);
> +	seq = mmap_seq_read_start(mm);
> +	if (seq & 1)
> +		goto spf_abort;
> +	rcu_read_lock();
> +	vma = __find_vma(mm, address);
> +	if (!vma || vma->vm_start > address) {

This fits the vma_lookup() pattern - although you will have to work
around the locking issue still. This is the same for the other
platforms too; they fit the pattern also.

> +		rcu_read_unlock();
> +		goto spf_abort;
> +	}
> +	pvma = *vma;
> +	rcu_read_unlock();
> +	if (!mmap_seq_read_check(mm, seq))
> +		goto spf_abort;
> +	vma = &pvma;
> +	if (unlikely(access_error(error_code, vma)))
> +		goto spf_abort;
> +	fault = do_handle_mm_fault(vma, address,
> +				   flags | FAULT_FLAG_SPECULATIVE, seq, regs);
> +
> +	if (!(fault & VM_FAULT_RETRY))
> +		goto done;
> +
> +	/* Quick path to respond to signals */
> +	if (fault_signal_pending(fault, regs)) {
> +		if (!user_mode(regs))
> +			kernelmode_fixup_or_oops(regs, error_code, address,
> +						 SIGBUS, BUS_ADRERR,
> +						 ARCH_DEFAULT_PKEY);
> +		return;
> +	}
> +
> +spf_abort:
> +	count_vm_event(SPF_ABORT);
> +#endif
> +
>  	/*
>  	 * Kernel-mode access to the user address space should only occur
>  	 * on well-defined single instructions listed in the exception
> @@ -1419,6 +1460,9 @@ void do_user_addr_fault(struct pt_regs *regs,
>  	}
>
>  	mmap_read_unlock(mm);
> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> +done:
> +#endif
>  	if (likely(!(fault & VM_FAULT_ERROR)))
>  		return;
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index b6678578a729..305f05d2a4bc 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -370,6 +370,11 @@ struct anon_vma_name {
>   * per VM-area/task. A VM area is any part of the process virtual memory
>   * space that has a special rule for the page-fault handlers (ie a shared
>   * library, the executable area etc).
> + *
> + * Note that speculative page faults make an on-stack copy of the VMA,
> + * so the structure size matters.
> + * (TODO - it would be preferable to copy only the required vma attributes
> + *  rather than the entire vma).
>   */
>  struct vm_area_struct {
>  	/* The first cache line has the info for VMA tree walking. */
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index 7b2363388bfa..f00b3e36ff39 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -133,6 +133,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
>  #ifdef CONFIG_X86
>  		DIRECT_MAP_LEVEL2_SPLIT,
>  		DIRECT_MAP_LEVEL3_SPLIT,
> +#endif
> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> +		SPF_ATTEMPT,
> +		SPF_ABORT,
>  #endif
>  		NR_VM_EVENT_ITEMS
>  };
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 4057372745d0..dbb0160e5558 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1390,6 +1390,10 @@ const char * const vmstat_text[] = {
>  	"direct_map_level2_splits",
>  	"direct_map_level3_splits",
>  #endif
> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> +	"spf_attempt",
> +	"spf_abort",
> +#endif
>  #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
>  };
>  #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
> --
> 2.20.1
>