Add powerpc-specific pte_free_defer(), to call pte_free() via call_rcu().
pte_free_defer() will be called inside khugepaged's retract_page_tables()
loop, where allocating extra memory cannot be relied upon. This precedes
the generic version to avoid build breakage from incompatible pgtable_t.

This is awkward because the struct page contains only one rcu_head, but
that page may be shared between PTE_FRAG_NR pagetables, each wanting to
use the rcu_head at the same time: account concurrent deferrals with a
heightened refcount, only the first making use of the rcu_head, but
re-deferring if more deferrals arrived during its grace period.

Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx>
---
 arch/powerpc/include/asm/pgalloc.h |  4 +++
 arch/powerpc/mm/pgtable-frag.c     | 51 ++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index 3360cad78ace..3a971e2a8c73 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -45,6 +45,10 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
 	pte_fragment_free((unsigned long *)ptepage, 0);
 }
 
+/* arch use pte_free_defer() implementation in arch/powerpc/mm/pgtable-frag.c */
+#define pte_free_defer pte_free_defer
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);
+
 /*
  * Functions that deal with pagetables that could be at any level of
  * the table need to be passed an "index_size" so they know how to
diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index 20652daa1d7e..e4f58c5fc2ac 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -120,3 +120,54 @@ void pte_fragment_free(unsigned long *table, int kernel)
 		__free_page(page);
 	}
 }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define PTE_FREE_DEFERRED 0x10000 /* beyond any PTE_FRAG_NR */
+
+static void pte_free_now(struct rcu_head *head)
+{
+	struct page *page;
+	int refcount;
+
+	page = container_of(head, struct page, rcu_head);
+	refcount = atomic_sub_return(PTE_FREE_DEFERRED - 1,
+				     &page->pt_frag_refcount);
+	if (refcount < PTE_FREE_DEFERRED) {
+		pte_fragment_free((unsigned long *)page_address(page), 0);
+		return;
+	}
+	/*
+	 * One page may be shared between PTE_FRAG_NR pagetables.
+	 * At least one more call to pte_free_defer() came in while we
+	 * were already deferring, so the free must be deferred again;
+	 * but just for one grace period, however many calls came in.
+	 */
+	while (refcount >= PTE_FREE_DEFERRED + PTE_FREE_DEFERRED) {
+		refcount = atomic_sub_return(PTE_FREE_DEFERRED,
+					     &page->pt_frag_refcount);
+	}
+	/* Remove that refcount of 1 left for fragment freeing above */
+	atomic_dec(&page->pt_frag_refcount);
+	call_rcu(&page->rcu_head, pte_free_now);
+}
+
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+	struct page *page;
+
+	page = virt_to_page(pgtable);
+	/*
+	 * One page may be shared between PTE_FRAG_NR pagetables: only queue
+	 * it once for freeing, but note whenever the free must be deferred.
+	 *
+	 * (This would be much simpler if the struct page had an rcu_head for
+	 * each fragment, or if we could allocate a separate array for that.)
+	 *
+	 * Convert our refcount of 1 to a refcount of PTE_FREE_DEFERRED, and
+	 * proceed to call_rcu() only when the rcu_head is not already in use.
+	 */
+	if (atomic_add_return(PTE_FREE_DEFERRED - 1, &page->pt_frag_refcount) <
+					PTE_FREE_DEFERRED + PTE_FREE_DEFERRED)
+		call_rcu(&page->rcu_head, pte_free_now);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
--
2.35.3