Currently, only SLAB uses the _mapcount field to store the number of active objects in a slab; the other slab allocators do not use it. As 16 bits are enough for that, use the remaining 16 bits of _mapcount as page_type even when SLAB is used, and then move the PG_slab flag to page_type. As suggested by Matthew, store the number of active objects in negative form and use helpers when accessing or modifying it. Note that page_type is always placed in the upper 16 bits of _mapcount so that a normal _mapcount value is not mistaken for a page_type. As overflow is not a concern anymore, use more lower bits. Add more folio helpers to PAGE_TYPE_OPS() so as not to break existing slab implementations. Remove the PG_slab check from PAGE_FLAGS_CHECK_AT_FREE. The buddy allocator will still check that _mapcount is properly set at free. Exclude PG_slab from hwpoison and show_page_flags() for now. Note that with this patch, page_mapped() and folio_mapped() always return false for slab pages. Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Christoph Lameter <cl@xxxxxxxxx> Cc: Pekka Enberg <penberg@xxxxxxxxxx> Cc: David Rientjes <rientjes@xxxxxxxxxx> Cc: Vlastimil Babka <vbabka@xxxxxxx> Cc: Naoya Horiguchi <naoya.horiguchi@xxxxxxx> Cc: Miaohe Lin <linmiaohe@xxxxxxxxxx> Cc: "Matthew Wilcox (Oracle)" <willy@xxxxxxxxxxxxx> Cc: Minchan Kim <minchan@xxxxxxxxxx> Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx> Cc: Dan Williams <dan.j.williams@xxxxxxxxx> Cc: Hugh Dickins <hughd@xxxxxxxxxx> Cc: Muchun Song <songmuchun@xxxxxxxxxxxxx> Cc: David Hildenbrand <david@xxxxxxxxxx> Signed-off-by: Hyeonggon Yoo <42.hyeyoo@xxxxxxxxx> --- fs/proc/page.c | 13 ++---- include/linux/mm_types.h | 11 +++-- include/linux/page-flags.h | 77 ++++++++++++++++++++++++---------- include/trace/events/mmflags.h | 1 - kernel/crash_core.c | 3 +- mm/memory-failure.c | 8 ---- mm/slab.c | 44 ++++++++++++------- mm/slab.h | 3 +- 8 files changed, 98 insertions(+), 62 deletions(-) diff --git a/fs/proc/page.c 
b/fs/proc/page.c index f2273b164535..101be8d5a74e 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -67,7 +67,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, */ ppage = pfn_to_online_page(pfn); - if (!ppage || PageSlab(ppage) || page_has_type(ppage)) + if (!ppage || page_has_type(ppage)) pcount = 0; else pcount = page_mapcount(ppage); @@ -124,11 +124,8 @@ u64 stable_page_flags(struct page *page) /* * pseudo flags for the well known (anonymous) memory mapped pages - * - * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the - * simple test in page_mapped() is not enough. */ - if (!PageSlab(page) && page_mapped(page)) + if (page_mapped(page)) u |= 1 << KPF_MMAP; if (PageAnon(page)) u |= 1 << KPF_ANON; @@ -178,16 +175,14 @@ u64 stable_page_flags(struct page *page) u |= 1 << KPF_OFFLINE; if (PageTable(page)) u |= 1 << KPF_PGTABLE; + if (PageSlab(page)) + u |= 1 << KPF_SLAB; if (page_is_idle(page)) u |= 1 << KPF_IDLE; u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - if (PageTail(page) && PageSlab(compound_head(page))) - u |= 1 << KPF_SLAB; - u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 834022721bc6..2f298d1b8cf5 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -196,10 +196,13 @@ struct page { atomic_t _mapcount; /* - * If the page is neither PageSlab nor mappable to userspace, - * the value stored here may help determine what this page - * is used for. See page-flags.h for a list of page types - * which are currently stored here. + * If the page is not mappable to userspace, the value + * stored here may help determine what this page is used for. + * See page-flags.h for a list of page types which are currently + * stored here. 
+ * + * Note that only upper half is used for page types and lower + * half is reserved for SLAB. */ unsigned int page_type; }; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0b0ae5084e60..31dda492cda5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -107,7 +107,6 @@ enum pageflags { PG_workingset, PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_error, - PG_slab, PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ PG_arch_1, PG_reserved, @@ -484,7 +483,6 @@ PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) TESTCLEARFLAG(Active, active, PF_HEAD) PAGEFLAG(Workingset, workingset, PF_HEAD) TESTCLEARFLAG(Workingset, workingset, PF_HEAD) -__PAGEFLAG(Slab, slab, PF_NO_TAIL) __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ @@ -926,42 +924,72 @@ static inline bool is_page_hwpoison(struct page *page) } /* - * For pages that are never mapped to userspace (and aren't PageSlab), - * page_type may be used. Because it is initialised to -1, we invert the - * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and - * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and - * low bits so that an underflow or overflow of page_mapcount() won't be - * mistaken for a page type value. + * For pages that are never mapped to userspace, page_type may be used. + * Because it is initialised to -1, we invert the sense of the bit, + * so __SetPageFoo *clears* the bit used for PageFoo, and __ClearPageFoo + * *sets* the bit used for PageFoo. We reserve a few high and low bits + * so that an underflow or overflow of page_mapcount() won't be mistaken + * for a page type value. 
*/ #define PAGE_TYPE_BASE 0xf0000000 -/* Reserve 0x0000007f to catch underflows of page_mapcount */ -#define PAGE_MAPCOUNT_RESERVE -128 -#define PG_buddy 0x00000080 -#define PG_offline 0x00000100 -#define PG_table 0x00000200 -#define PG_guard 0x00000400 +#define PG_buddy 0x00010000 +#define PG_offline 0x00020000 +#define PG_table 0x00040000 +#define PG_guard 0x00080000 +#define PG_slab 0x00100000 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) -static inline int page_has_type(struct page *page) +#define PAGE_TYPE_MASK 0xffff0000 + +static inline bool page_type_has_type(unsigned int page_type) { - return (int)page->page_type < PAGE_MAPCOUNT_RESERVE; + return ((int)page_type < (int)PAGE_TYPE_MASK); } -#define PAGE_TYPE_OPS(uname, lname) \ +static inline bool page_has_type(struct page *page) +{ + return page_type_has_type(page->page_type); +} + + +#define PAGE_TYPE_OPS(uname, lname, policy) \ static __always_inline int Page##uname(struct page *page) \ { \ + page = policy(page, 0); \ + return PageType(page, PG_##lname); \ +} \ +static __always_inline int folio_test_##lname(struct folio *folio) \ +{ \ + struct page *page = &folio->page; \ + \ return PageType(page, PG_##lname); \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ + page = policy(page, 1); \ + VM_BUG_ON_PAGE(!PageType(page, 0), page); \ + page->page_type &= ~PG_##lname; \ +} \ +static __always_inline void __folio_set_##lname(struct folio *folio) \ +{ \ + struct page *page = &folio->page; \ + \ VM_BUG_ON_PAGE(!PageType(page, 0), page); \ page->page_type &= ~PG_##lname; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ + page = policy(page, 1); \ + VM_BUG_ON_PAGE(!Page##uname(page), page); \ + page->page_type |= PG_##lname; \ +} \ +static __always_inline void __folio_clear_##lname(struct folio *folio) \ +{ \ + struct page *page = &folio->page; \ + \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type |= 
PG_##lname; \ } @@ -970,7 +998,7 @@ static __always_inline void __ClearPage##uname(struct page *page) \ * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ -PAGE_TYPE_OPS(Buddy, buddy) +PAGE_TYPE_OPS(Buddy, buddy, PF_ANY) /* * PageOffline() indicates that the page is logically offline although the @@ -994,7 +1022,10 @@ PAGE_TYPE_OPS(Buddy, buddy) * pages should check PageOffline() and synchronize with such drivers using * page_offline_freeze()/page_offline_thaw(). */ -PAGE_TYPE_OPS(Offline, offline) +PAGE_TYPE_OPS(Offline, offline, PF_ANY) + +/* PageSlab() indicates that the page is used by slab subsystem. */ +PAGE_TYPE_OPS(Slab, slab, PF_NO_TAIL) extern void page_offline_freeze(void); extern void page_offline_thaw(void); @@ -1004,12 +1035,12 @@ extern void page_offline_end(void); /* * Marks pages in use as page tables. */ -PAGE_TYPE_OPS(Table, table) +PAGE_TYPE_OPS(Table, table, PF_ANY) /* * Marks guardpages used with debug_pagealloc. */ -PAGE_TYPE_OPS(Guard, guard) +PAGE_TYPE_OPS(Guard, guard, PF_ANY) extern bool is_free_buddy_page(struct page *page); @@ -1057,8 +1088,8 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) (1UL << PG_lru | 1UL << PG_locked | \ 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ - 1UL << PG_slab | 1UL << PG_active | \ - 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) + 1UL << PG_active | 1UL << PG_unevictable | \ + __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. 
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 11524cda4a95..72c11a16f771 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -112,7 +112,6 @@ {1UL << PG_lru, "lru" }, \ {1UL << PG_active, "active" }, \ {1UL << PG_workingset, "workingset" }, \ - {1UL << PG_slab, "slab" }, \ {1UL << PG_owner_priv_1, "owner_priv_1" }, \ {1UL << PG_arch_1, "arch_1" }, \ {1UL << PG_reserved, "reserved" }, \ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a0eb4d5cf557..f72437e4192f 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -479,13 +479,14 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); VMCOREINFO_NUMBER(PG_swapbacked); - VMCOREINFO_NUMBER(PG_slab); #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); #endif VMCOREINFO_NUMBER(PG_head_mask); #define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#define PAGE_SLAB_MAPCOUNT_VALUE (~PG_slab) + VMCOREINFO_NUMBER(PAGE_SLAB_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); #define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 779a426d2cab..9494f47c4cee 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1145,7 +1145,6 @@ static int me_huge_page(struct page_state *ps, struct page *p) #define mlock (1UL << PG_mlocked) #define lru (1UL << PG_lru) #define head (1UL << PG_head) -#define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) static struct page_state error_states[] = { @@ -1155,13 +1154,6 @@ static struct page_state error_states[] = { * PG_buddy pages only make a small fraction of all free pages. */ - /* - * Could in theory check if slab page is free or if we can drop - * currently unused objects without touching them. But just - * treat it as standard kernel for now. 
- */ - { slab, slab, MF_MSG_SLAB, me_kernel }, - { head, head, MF_MSG_HUGE, me_huge_page }, { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, diff --git a/mm/slab.c b/mm/slab.c index 59c8e28f7b6a..da12e82aba41 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2265,6 +2265,21 @@ void __kmem_cache_release(struct kmem_cache *cachep) } } +static inline unsigned int slab_get_active(struct slab *slab) +{ + return ~(slab->page_type | PG_slab); +} + +static inline void slab_inc_active(struct slab *slab) +{ + slab->page_type--; +} + +static inline void slab_dec_active(struct slab *slab) +{ + slab->page_type++; +} + /* * Get the memory for a slab management obj. * @@ -2287,7 +2302,6 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, void *addr = slab_address(slab); slab->s_mem = addr + colour_off; - slab->active = 0; if (OBJFREELIST_SLAB(cachep)) freelist = NULL; @@ -2506,8 +2520,8 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slab) { void *objp; - objp = index_to_obj(cachep, slab, get_free_obj(slab, slab->active)); - slab->active++; + objp = index_to_obj(cachep, slab, get_free_obj(slab, slab_get_active(slab))); + slab_inc_active(slab); return objp; } @@ -2520,7 +2534,7 @@ static void slab_put_obj(struct kmem_cache *cachep, unsigned int i; /* Verify double free bug */ - for (i = slab->active; i < cachep->num; i++) { + for (i = slab_get_active(slab); i < cachep->num; i++) { if (get_free_obj(slab, i) == objnr) { pr_err("slab: double free detected in cache '%s', objp %px\n", cachep->name, objp); @@ -2528,11 +2542,11 @@ static void slab_put_obj(struct kmem_cache *cachep, } } #endif - slab->active--; + slab_dec_active(slab); if (!slab->freelist) slab->freelist = objp + obj_offset(cachep); - set_free_obj(slab, slab->active, objnr); + set_free_obj(slab, slab_get_active(slab), objnr); } /* @@ -2631,14 +2645,14 @@ static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab) spin_lock(&n->list_lock); n->total_slabs++; - if 
(!slab->active) { + if (!slab_get_active(slab)) { list_add_tail(&slab->slab_list, &n->slabs_free); n->free_slabs++; } else fixup_slab_list(cachep, n, slab, &list); STATS_INC_GROWN(cachep); - n->free_objects += cachep->num - slab->active; + n->free_objects += cachep->num - slab_get_active(slab); spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); @@ -2740,7 +2754,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep, { /* move slabp to correct slabp list: */ list_del(&slab->slab_list); - if (slab->active == cachep->num) { + if (slab_get_active(slab) == cachep->num) { list_add(&slab->slab_list, &n->slabs_full); if (OBJFREELIST_SLAB(cachep)) { #if DEBUG @@ -2779,7 +2793,7 @@ static noinline struct slab *get_valid_first_slab(struct kmem_cache_node *n, /* Move pfmemalloc slab to the end of list to speed up next search */ list_del(&slab->slab_list); - if (!slab->active) { + if (!slab_get_active(slab)) { list_add_tail(&slab->slab_list, &n->slabs_free); n->free_slabs++; } else @@ -2861,9 +2875,9 @@ static __always_inline int alloc_block(struct kmem_cache *cachep, * There must be at least one object available for * allocation. 
*/ - BUG_ON(slab->active >= cachep->num); + BUG_ON(slab_get_active(slab) >= cachep->num); - while (slab->active < cachep->num && batchcount--) { + while (slab_get_active(slab) < cachep->num && batchcount--) { STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); @@ -3158,7 +3172,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - BUG_ON(slab->active == cachep->num); + BUG_ON(slab_get_active(slab) == cachep->num); obj = slab_get_obj(cachep, slab); n->free_objects--; @@ -3292,7 +3306,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, STATS_DEC_ACTIVE(cachep); /* fixup slab chains */ - if (slab->active == 0) { + if (slab_get_active(slab) == 0) { list_add(&slab->slab_list, &n->slabs_free); n->free_slabs++; } else { @@ -3347,7 +3361,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) struct slab *slab; list_for_each_entry(slab, &n->slabs_free, slab_list) { - BUG_ON(slab->active); + BUG_ON(slab_get_active(slab)); i++; } diff --git a/mm/slab.h b/mm/slab.h index 0202a8c2f0d2..f9df0fc3a918 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -18,7 +18,8 @@ struct slab { struct kmem_cache *slab_cache; void *freelist; /* array of free object indexes */ void *s_mem; /* first object */ - unsigned int active; + /* lower half of page_type is used as active objects counter */ + unsigned int page_type; #elif defined(CONFIG_SLUB) -- 2.32.0