On Mon, Aug 26, 2024 at 5:46 AM Yunsheng Lin <linyunsheng@xxxxxxxxxx> wrote: > > Currently there is one 'struct page_frag' for every 'struct > sock' and 'struct task_struct', we are about to replace the > 'struct page_frag' with 'struct page_frag_cache' for them. > Before begin the replacing, we need to ensure the size of > 'struct page_frag_cache' is not bigger than the size of > 'struct page_frag', as there may be tens of thousands of > 'struct sock' and 'struct task_struct' instances in the > system. > > By or'ing the page order & pfmemalloc with lower bits of > 'va' instead of using 'u16' or 'u32' for page size and 'u8' > for pfmemalloc, we are able to avoid 3 or 5 bytes space waste. > And page address & pfmemalloc & order is unchanged for the > same page in the same 'page_frag_cache' instance, it makes > sense to fit them together. > > After this patch, the size of 'struct page_frag_cache' should be > the same as the size of 'struct page_frag'. > > CC: Alexander Duyck <alexander.duyck@xxxxxxxxx> > Signed-off-by: Yunsheng Lin <linyunsheng@xxxxxxxxxx> > --- > include/linux/mm_types_task.h | 19 ++++++----- > include/linux/page_frag_cache.h | 60 +++++++++++++++++++++++++++++++-- > mm/page_frag_cache.c | 51 +++++++++++++++------------- > 3 files changed, 97 insertions(+), 33 deletions(-) > > diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h > index cdc1e3696439..a8635460e027 100644 > --- a/include/linux/mm_types_task.h > +++ b/include/linux/mm_types_task.h > @@ -50,18 +50,21 @@ struct page_frag { > #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) > #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) > struct page_frag_cache { > - void *va; > -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) > + /* encoded_page consists of the virtual address, pfmemalloc bit and order > + * of a page. 
> + */ > + unsigned long encoded_page; > + > + /* we maintain a pagecount bias, so that we dont dirty cache line > + * containing page->_refcount every time we allocate a fragment. > + */ > +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32) > __u16 offset; > - __u16 size; > + __u16 pagecnt_bias; > #else > __u32 offset; > + __u32 pagecnt_bias; > #endif > - /* we maintain a pagecount bias, so that we dont dirty cache line > - * containing page->_refcount every time we allocate a fragment. > - */ > - unsigned int pagecnt_bias; > - bool pfmemalloc; > }; > > /* Track pages that require TLB flushes */ > diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h > index 0a52f7a179c8..372d6ed7e20a 100644 > --- a/include/linux/page_frag_cache.h > +++ b/include/linux/page_frag_cache.h > @@ -3,18 +3,74 @@ > #ifndef _LINUX_PAGE_FRAG_CACHE_H > #define _LINUX_PAGE_FRAG_CACHE_H > > +#include <linux/bits.h> > +#include <linux/build_bug.h> > #include <linux/log2.h> > +#include <linux/mm.h> > #include <linux/mm_types_task.h> > #include <linux/types.h> > > +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) > +/* Use a full byte here to enable assembler optimization as the shift > + * operation is usually expecting a byte. > + */ > +#define PAGE_FRAG_CACHE_ORDER_MASK GENMASK(7, 0) > +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT 8 > +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT BIT(PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT) > +#else > +/* Compiler should be able to figure out we don't read things as any value > + * ANDed with 0 is 0. 
> + */ > +#define PAGE_FRAG_CACHE_ORDER_MASK 0 > +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT 0 > +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT BIT(PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT) > +#endif > + > +static inline unsigned long page_frag_encode_page(struct page *page, > + unsigned int order, > + bool pfmemalloc) > +{ > + BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_FRAG_CACHE_ORDER_MASK); > + BUILD_BUG_ON(PAGE_FRAG_CACHE_PFMEMALLOC_BIT >= PAGE_SIZE); > + > + return (unsigned long)page_address(page) | > + (order & PAGE_FRAG_CACHE_ORDER_MASK) | > + ((unsigned long)pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT); > +} > + > +static inline unsigned long page_frag_encoded_page_order(unsigned long encoded_page) > +{ > + return encoded_page & PAGE_FRAG_CACHE_ORDER_MASK; > +} > + > +static inline bool page_frag_encoded_page_pfmemalloc(unsigned long encoded_page) > +{ > + return !!(encoded_page & PAGE_FRAG_CACHE_PFMEMALLOC_BIT); > +} > + > +static inline void *page_frag_encoded_page_address(unsigned long encoded_page) > +{ > + return (void *)(encoded_page & PAGE_MASK); > +} > + > +static inline struct page *page_frag_encoded_page_ptr(unsigned long encoded_page) > +{ > + return virt_to_page((void *)encoded_page); > +} > + > static inline void page_frag_cache_init(struct page_frag_cache *nc) > { > - nc->va = NULL; > + nc->encoded_page = 0; > } > > static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc) > { > - return !!nc->pfmemalloc; > + return page_frag_encoded_page_pfmemalloc(nc->encoded_page); > +} > + > +static inline unsigned int page_frag_cache_page_size(unsigned long encoded_page) > +{ > + return PAGE_SIZE << page_frag_encoded_page_order(encoded_page); > } > > void page_frag_cache_drain(struct page_frag_cache *nc); So how many of these additions are actually needed outside of the page_frag_cache.c file? 
I'm just wondering if we could look at moving most of these into the .c file itself instead of making them accessible to all callers. I don't believe we currently have anyone looking into the size of the frag cache or anything like that, and I would prefer to avoid exposing such functionality if possible. As the non-order-0 allocation problem with this has pointed out, people will exploit any interface that is exposed, even if unintentionally. I would want to move the size/order logic, as well as the helpers that split out the virtual address, since we shouldn't be allowing the user to look at that without going through an allocation function.