On Thu, Dec 7, 2023 at 5:37 PM Christophe Leroy <christophe.leroy@xxxxxxxxxx> wrote: > > The subject says "riscv:" but it changes core part and several arch. > Maybe this commit should be split in two commits, one for API changes > that changes flush_tlb_fix_spurious_fault() to > flush_tlb_fix_spurious_write_fault() and adds > flush_tlb_fix_spurious_read_fault() including the change in memory.c, > then a second patch with the changes to riscv. You're right, I'll do that, thanks. > > Le 07/12/2023 à 16:03, Alexandre Ghiti a écrit : > > The preventive sfence.vma were emitted because new mappings must be made > > visible to the page table walker, either the uarch caches invalid > > entries or not. > > > > Actually, there is no need to preventively sfence.vma on new mappings for > > userspace, this should be handled only in the page fault path. > > > > This allows to drastically reduce the number of sfence.vma emitted: > > > > * Ubuntu boot to login: > > Before: ~630k sfence.vma > > After: ~200k sfence.vma > > > > * ltp - mmapstress01 > > Before: ~45k > > After: ~6.3k > > > > * lmbench - lat_pagefault > > Before: ~665k > > After: 832 (!) > > > > * lmbench - lat_mmap > > Before: ~546k > > After: 718 (!) > > > > The only issue with the removal of sfence.vma in update_mmu_cache() is > > that on uarchs that cache invalid entries, those won't be invalidated > > until the process takes a fault: so that's an additional fault in those > > cases. > > > > Signed-off-by: Alexandre Ghiti <alexghiti@xxxxxxxxxxxx> > > --- > > arch/arm64/include/asm/pgtable.h | 2 +- > > arch/mips/include/asm/pgtable.h | 6 +-- > > arch/powerpc/include/asm/book3s/64/tlbflush.h | 8 ++-- > > arch/riscv/include/asm/pgtable.h | 43 +++++++++++-------- > > include/linux/pgtable.h | 8 +++- > > mm/memory.c | 12 +++++- > > 6 files changed, 48 insertions(+), 31 deletions(-) > > Did you forget mm/pgtable-generic.c ? Indeed, I "missed" the occurrence of flush_tlb_fix_spurious_fault() there, thanks. > > > > > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h > > index 7f7d9b1df4e5..728f25f529a5 100644 > > --- a/arch/arm64/include/asm/pgtable.h > > +++ b/arch/arm64/include/asm/pgtable.h > > @@ -57,7 +57,7 @@ static inline bool arch_thp_swp_supported(void) > > * fault on one CPU which has been handled concurrently by another CPU > > * does not need to perform additional invalidation. > > */ > > -#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) > > +#define flush_tlb_fix_spurious_write_fault(vma, address, ptep) do { } while (0) > > Why do you need to do that change ? Nothing is explained about that in > the commit message. I renamed this macro because in the page fault path, flush_tlb_fix_spurious_fault() is called only when the fault is a write fault (see https://elixir.bootlin.com/linux/latest/source/mm/memory.c#L5016). I'll check if that fits the occurrence in mm/pgtable-generic.c too. Thanks again for the review, Alex > > > > > /* > > * ZERO_PAGE is a global shared page that is always zero: used > > diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h > > index 430b208c0130..84439fe6ed29 100644 > > --- a/arch/mips/include/asm/pgtable.h > > +++ b/arch/mips/include/asm/pgtable.h > > @@ -478,9 +478,9 @@ static inline pgprot_t pgprot_writecombine(pgprot_t _prot) > > return __pgprot(prot); > > } > > > > -static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, > > - unsigned long address, > > - pte_t *ptep) > > +static inline void flush_tlb_fix_spurious_write_fault(struct vm_area_struct *vma, > > + unsigned long address, > > + pte_t *ptep) > > { > > } > > > > diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h > > index 1950c1b825b4..7166d56f90db 100644 > > --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h > > +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h > > @@ -128,10 +128,10 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, > > #define flush_tlb_page(vma, addr) local_flush_tlb_page(vma, addr) > > #endif /* CONFIG_SMP */ > > > > -#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault > > -static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, > > - unsigned long address, > > - pte_t *ptep) > > +#define flush_tlb_fix_spurious_write_fault flush_tlb_fix_spurious_write_fault > > +static inline void flush_tlb_fix_spurious_write_fault(struct vm_area_struct *vma, > > + unsigned long address, > > + pte_t *ptep) > > { > > /* > > * Book3S 64 does not require spurious fault flushes because the PTE > > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h > > index b2ba3f79cfe9..89aa5650f104 100644 > > --- a/arch/riscv/include/asm/pgtable.h > > +++ b/arch/riscv/include/asm/pgtable.h > > @@ -472,28 +472,20 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf, > > struct vm_area_struct *vma, unsigned long address, > > pte_t *ptep, unsigned int nr) > > { > > - /* > > - * The kernel assumes that TLBs don't cache invalid entries, but > > - * in RISC-V, SFENCE.VMA specifies an ordering constraint, not a > > - * cache flush; it is necessary even after writing invalid entries. > > - * Relying on flush_tlb_fix_spurious_fault would suffice, but > > - * the extra traps reduce performance. So, eagerly SFENCE.VMA. > > - */ > > - while (nr--) > > - local_flush_tlb_page(address + nr * PAGE_SIZE); > > } > > #define update_mmu_cache(vma, addr, ptep) \ > > update_mmu_cache_range(NULL, vma, addr, ptep, 1) > > > > #define __HAVE_ARCH_UPDATE_MMU_TLB > > -#define update_mmu_tlb update_mmu_cache > > +static inline void update_mmu_tlb(struct vm_area_struct *vma, > > + unsigned long address, pte_t *ptep) > > +{ > > + flush_tlb_range(vma, address, address + PAGE_SIZE); > > +} > > > > static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, > > unsigned long address, pmd_t *pmdp) > > { > > - pte_t *ptep = (pte_t *)pmdp; > > - > > - update_mmu_cache(vma, address, ptep); > > } > > > > #define __HAVE_ARCH_PTE_SAME > > @@ -548,13 +540,26 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, > > unsigned long address, pte_t *ptep, > > pte_t entry, int dirty) > > { > > - if (!pte_same(*ptep, entry)) > > + if (!pte_same(*ptep, entry)) { > > __set_pte_at(ptep, entry); > > - /* > > - * update_mmu_cache will unconditionally execute, handling both > > - * the case that the PTE changed and the spurious fault case. > > - */ > > - return true; > > + /* Here only not svadu is impacted */ > > + flush_tlb_page(vma, address); > > + return true; > > + } > > + > > + return false; > > +} > > + > > +extern u64 nr_sfence_vma_handle_exception; > > +extern bool tlb_caching_invalid_entries; > > + > > +#define flush_tlb_fix_spurious_read_fault flush_tlb_fix_spurious_read_fault > > +static inline void flush_tlb_fix_spurious_read_fault(struct vm_area_struct *vma, > > + unsigned long address, > > + pte_t *ptep) > > +{ > > + if (tlb_caching_invalid_entries) > > + flush_tlb_page(vma, address); > > } > > > > #define __HAVE_ARCH_PTEP_GET_AND_CLEAR > > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h > > index af7639c3b0a3..7abaf42ef612 100644 > > --- a/include/linux/pgtable.h > > +++ b/include/linux/pgtable.h > > @@ -931,8 +931,12 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) > > # define pte_accessible(mm, pte) ((void)(pte), 1) > > #endif > > > > -#ifndef flush_tlb_fix_spurious_fault > > -#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) > > +#ifndef flush_tlb_fix_spurious_write_fault > > +#define flush_tlb_fix_spurious_write_fault(vma, address, ptep) flush_tlb_page(vma, address) > > +#endif > > + > > +#ifndef flush_tlb_fix_spurious_read_fault > > +#define flush_tlb_fix_spurious_read_fault(vma, address, ptep) > > #endif > > > > /* > > diff --git a/mm/memory.c b/mm/memory.c > > index 517221f01303..5cb0ccf0c03f 100644 > > --- a/mm/memory.c > > +++ b/mm/memory.c > > @@ -5014,8 +5014,16 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) > > * with threads. > > */ > > if (vmf->flags & FAULT_FLAG_WRITE) > > - flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, > > - vmf->pte); > > + flush_tlb_fix_spurious_write_fault(vmf->vma, vmf->address, > > + vmf->pte); > > + else > > + /* > > + * With the pte_same(ptep_get(vmf->pte), entry) check > > + * that calls update_mmu_tlb() above, multiple threads > > + * faulting at the same time won't get there. > > + */ > > + flush_tlb_fix_spurious_read_fault(vmf->vma, vmf->address, > > + vmf->pte); > > } > > unlock: > > pte_unmap_unlock(vmf->pte, vmf->ptl);