On Tue, May 04, 2021 at 05:30:26PM -0700, Rick Edgecombe wrote:
> For x86, setting memory permissions on the direct map results in fracturing
> large pages. Direct map fracturing can be reduced by locating pages that
> will have their permissions set close together.
> 
> Create a simple page cache that allocates pages from huge page size
> blocks. Don't guarantee that a page will come from a huge page grouping,
> instead fallback to non-grouped pages to fulfill the allocation if
> needed. Also, register a shrinker such that the system can ask for the
> pages back if needed. Since this is only needed when there is a direct
> map, compile it out on highmem systems.

I only had time to skim through the patches, but I like the idea of having
a simple cache that allocates larger pages with a fallback to the base
page size.

I just think it should be more generic and closer to the page allocator.
I was thinking about adding a GFP flag that would tell the page allocator
that the allocated pages should be removed from the direct map. Then
alloc_pages() could use such a cache whenever this GFP flag is specified,
with a fallback to lower order allocations.
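Roughly like the sketch below (entirely untested; __GFP_NO_DIRECT_MAP,
alloc_pages_unmapped() and unmapped_cache_alloc() are names I'm making up
here for illustration, and error unwinding is omitted):

	/*
	 * Allocate pages that are not present in the direct map. Prefer a
	 * page carved out of a huge page sized block that is already
	 * unmapped, fall back to unmapping a fresh lower order allocation.
	 */
	static struct page *alloc_pages_unmapped(gfp_t gfp, unsigned int order)
	{
		struct page *page;
		int i;

		/* The cache hands out pages split from unmapped 2M blocks */
		page = unmapped_cache_alloc(numa_node_id(), order);
		if (page)
			return page;

		/* Mask off the made up flag so we don't recurse */
		page = alloc_pages(gfp & ~__GFP_NO_DIRECT_MAP, order);
		if (!page)
			return NULL;

		for (i = 0; i < (1 << order); i++)
			set_direct_map_invalid_noflush(page + i);

		/* A TLB flush is needed before the pages are handed out */
		return page;
	}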
> Free pages in the cache are kept track of in per-node list inside a
> list_lru. NUMA_NO_NODE requests are serviced by checking each per-node
> list in a round robin fashion. If pages are requested for a certain node
> but the cache is empty for that node, a whole additional huge page size
> page is allocated.
> 
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@xxxxxxxxx>
> ---
>  arch/x86/include/asm/set_memory.h |  14 +++
>  arch/x86/mm/pat/set_memory.c      | 151 ++++++++++++++++++++++++++++++
>  2 files changed, 165 insertions(+)
> 
> diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
> index 4352f08bfbb5..b63f09cc282a 100644
> --- a/arch/x86/include/asm/set_memory.h
> +++ b/arch/x86/include/asm/set_memory.h
> @@ -4,6 +4,9 @@
>  
>  #include <asm/page.h>
>  #include <asm-generic/set_memory.h>
> +#include <linux/gfp.h>
> +#include <linux/list_lru.h>
> +#include <linux/shrinker.h>
>  
>  /*
>   * The set_memory_* API can be used to change various attributes of a virtual
> @@ -135,4 +138,15 @@ static inline int clear_mce_nospec(unsigned long pfn)
>   */
>  #endif
>  
> +struct grouped_page_cache {
> +	struct shrinker shrinker;
> +	struct list_lru lru;
> +	gfp_t gfp;
> +	atomic_t nid_round_robin;
> +};
> +
> +int init_grouped_page_cache(struct grouped_page_cache *gpc, gfp_t gfp);
> +struct page *get_grouped_page(int node, struct grouped_page_cache *gpc);
> +void free_grouped_page(struct grouped_page_cache *gpc, struct page *page);
> +
>  #endif /* _ASM_X86_SET_MEMORY_H */
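Just to check that I read this API right, a user of the cache would look
something like the example below (made up, pt_cache and the helpers around
it are not from this series):

	static struct grouped_page_cache pt_cache;

	static int __init pt_cache_init(void)
	{
		/* Back the allocations with huge page sized blocks */
		return init_grouped_page_cache(&pt_cache, GFP_KERNEL);
	}

	static struct page *pt_alloc_page(int nid)
	{
		/* May fall back to an order-0 page if the cache is empty */
		return get_grouped_page(nid, &pt_cache);
	}

	static void pt_free_page(struct page *page)
	{
		/* Return the page to the cache rather than to the buddy */
		free_grouped_page(&pt_cache, page);
	}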
> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
> index 16f878c26667..6877ef66793b 100644
> --- a/arch/x86/mm/pat/set_memory.c
> +++ b/arch/x86/mm/pat/set_memory.c
> @@ -2306,6 +2306,157 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
>  	return retval;
>  }
>  
> +#ifndef CONFIG_HIGHMEM
> +static struct page *__alloc_page_order(int node, gfp_t gfp_mask, int order)
> +{
> +	if (node == NUMA_NO_NODE)
> +		return alloc_pages(gfp_mask, order);
> +
> +	return alloc_pages_node(node, gfp_mask, order);
> +}
> +
> +static struct grouped_page_cache *__get_gpc_from_sc(struct shrinker *shrinker)
> +{
> +	return container_of(shrinker, struct grouped_page_cache, shrinker);
> +}
> +
> +static unsigned long grouped_shrink_count(struct shrinker *shrinker,
> +					  struct shrink_control *sc)
> +{
> +	struct grouped_page_cache *gpc = __get_gpc_from_sc(shrinker);
> +	unsigned long page_cnt = list_lru_shrink_count(&gpc->lru, sc);
> +
> +	return page_cnt ? page_cnt : SHRINK_EMPTY;
> +}
> +
> +static enum lru_status grouped_isolate(struct list_head *item,
> +				       struct list_lru_one *list,
> +				       spinlock_t *lock, void *cb_arg)
> +{
> +	struct list_head *dispose = cb_arg;
> +
> +	list_lru_isolate_move(list, item, dispose);
> +
> +	return LRU_REMOVED;
> +}
> +
> +static void __dispose_pages(struct grouped_page_cache *gpc, struct list_head *head)
> +{
> +	struct list_head *cur, *next;
> +
> +	list_for_each_safe(cur, next, head) {
> +		struct page *page = list_entry(cur, struct page, lru);
> +
> +		list_del(cur);
> +
> +		__free_pages(page, 0);
> +	}
> +}
> +
> +static unsigned long grouped_shrink_scan(struct shrinker *shrinker,
> +					 struct shrink_control *sc)
> +{
> +	struct grouped_page_cache *gpc = __get_gpc_from_sc(shrinker);
> +	unsigned long isolated;
> +	LIST_HEAD(freeable);
> +
> +	if (!(sc->gfp_mask & gpc->gfp))
> +		return SHRINK_STOP;
> +
> +	isolated = list_lru_shrink_walk(&gpc->lru, sc, grouped_isolate,
> +					&freeable);
> +	__dispose_pages(gpc, &freeable);
> +
> +	/* Every item walked gets isolated */
> +	sc->nr_scanned += isolated;
> +
> +	return isolated;
> +}
> +
> +static struct page *__remove_first_page(struct grouped_page_cache *gpc, int node)
> +{
> +	unsigned int start_nid, i;
> +	struct list_head *head;
> +
> +	if (node != NUMA_NO_NODE) {
> +		head = list_lru_get_mru(&gpc->lru, node);
> +		if (head)
> +			return list_entry(head, struct page, lru);
> +		return NULL;
> +	}
> +
> +	/* If NUMA_NO_NODE, search the nodes in round robin for a page */
> +	start_nid = (unsigned int)atomic_fetch_inc(&gpc->nid_round_robin) % nr_node_ids;
> +	for (i = 0; i < nr_node_ids; i++) {
> +		int cur_nid = (start_nid + i) % nr_node_ids;
> +
> +		head = list_lru_get_mru(&gpc->lru, cur_nid);
> +		if (head)
> +			return list_entry(head, struct page, lru);
> +	}
> +
> +	return NULL;
> +}
> +
> +/* Get and add some new pages to the cache, returning one of them */
> +static struct page *__replenish_grouped_pages(struct grouped_page_cache *gpc, int node)
> +{
> +	const unsigned int hpage_cnt = HPAGE_SIZE >> PAGE_SHIFT;
> +	struct page *page;
> +	int i;
> +
> +	page = __alloc_page_order(node, gpc->gfp, HUGETLB_PAGE_ORDER);
> +	if (!page)
> +		return __alloc_page_order(node, gpc->gfp, 0);
> +
> +	split_page(page, HUGETLB_PAGE_ORDER);
> +
> +	for (i = 1; i < hpage_cnt; i++)
> +		free_grouped_page(gpc, &page[i]);
> +
> +	return &page[0];
> +}
> +
> +int init_grouped_page_cache(struct grouped_page_cache *gpc, gfp_t gfp)
> +{
> +	int err = 0;
> +
> +	memset(gpc, 0, sizeof(struct grouped_page_cache));
> +
> +	err = list_lru_init(&gpc->lru);
> +	if (err)
> +		goto out;
> +
> +	gpc->shrinker.count_objects = grouped_shrink_count;
> +	gpc->shrinker.scan_objects = grouped_shrink_scan;
> +	gpc->shrinker.seeks = DEFAULT_SEEKS;
> +	gpc->shrinker.flags = SHRINKER_NUMA_AWARE;
> +
> +	err = register_shrinker(&gpc->shrinker);
> +	if (err)
> +		list_lru_destroy(&gpc->lru);
> +
> +out:
> +	return err;
> +}
> +
> +struct page *get_grouped_page(int node, struct grouped_page_cache *gpc)
> +{
> +	struct page *page;
> +
> +	page = __remove_first_page(gpc, node);
> +
> +	if (page)
> +		return page;
> +
> +	return __replenish_grouped_pages(gpc, node);
> +}
> +
> +void free_grouped_page(struct grouped_page_cache *gpc, struct page *page)
> +{
> +	INIT_LIST_HEAD(&page->lru);
> +	list_lru_add_node(&gpc->lru, &page->lru, page_to_nid(page));
> +}
> +#endif /* !CONFIG_HIGHMEM */
>  /*
>   * The testcases use internal knowledge of the implementation that shouldn't
>   * be exposed to the rest of the kernel. Include these directly here.
> -- 
> 2.30.2
> 

-- 
Sincerely yours,
Mike.