On Fri, Mar 01, 2024 at 10:54:16AM -0500, rulinhuang wrote:
> Moved data structures and basic helpers related to the per cpu kva
> allocator up along with these macros, with no functional change.
> 
> Signed-off-by: rulinhuang <rulin.huang@xxxxxxxxx>
> ---
> V6 -> V7: Adjusted the macros
> ---
>  mm/vmalloc.c | 262 +++++++++++++++++++++++++--------------------------
>  1 file changed, 131 insertions(+), 131 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 25a8df497255..fc027a61c12e 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -887,6 +887,137 @@ is_vn_id_valid(unsigned int node_id)
>  	return false;
>  }
>  
> +/*
> + * vmap space is limited especially on 32 bit architectures. Ensure there is
> + * room for at least 16 percpu vmap blocks per CPU.
> + */
> +/*
> + * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
> + * to #define VMALLOC_SPACE	(VMALLOC_END-VMALLOC_START). Guess
> + * instead (we just need a rough idea)
> + */
> +#if BITS_PER_LONG == 32
> +#define VMALLOC_SPACE		(128UL*1024*1024)
> +#else
> +#define VMALLOC_SPACE		(128UL*1024*1024*1024)
> +#endif
> +
> +#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
> +#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
> +#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
> +#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
> +#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
> +#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
> +#define VMAP_BBMAP_BITS		\
> +		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
> +		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
> +			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
> +
> +#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
> +
> +/*
> + * Purge threshold to prevent overeager purging of fragmented blocks for
> + * regular operations: Purge if vb->free is less than 1/4 of the capacity.
> + */
> +#define VMAP_PURGE_THRESHOLD	(VMAP_BBMAP_BITS / 4)
> +
> +#define VMAP_RAM		0x1 /* indicates vm_map_ram area*/
> +#define VMAP_BLOCK		0x2 /* mark out the vmap_block sub-type*/
> +#define VMAP_FLAGS_MASK		0x3
> +
> +struct vmap_block_queue {
> +	spinlock_t lock;
> +	struct list_head free;
> +
> +	/*
> +	 * An xarray requires an extra memory dynamically to
> +	 * be allocated. If it is an issue, we can use rb-tree
> +	 * instead.
> +	 */
> +	struct xarray vmap_blocks;
> +};
> +
> +struct vmap_block {
> +	spinlock_t lock;
> +	struct vmap_area *va;
> +	unsigned long free, dirty;
> +	DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
> +	unsigned long dirty_min, dirty_max; /*< dirty range */
> +	struct list_head free_list;
> +	struct rcu_head rcu_head;
> +	struct list_head purge;
> +};
> +
> +/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
> +static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
> +
> +/*
> + * In order to fast access to any "vmap_block" associated with a
> + * specific address, we use a hash.
> + *
> + * A per-cpu vmap_block_queue is used in both ways, to serialize
> + * an access to free block chains among CPUs(alloc path) and it
> + * also acts as a vmap_block hash(alloc/free paths). It means we
> + * overload it, since we already have the per-cpu array which is
> + * used as a hash table. When used as a hash a 'cpu' passed to
> + * per_cpu() is not actually a CPU but rather a hash index.
> + *
> + * A hash function is addr_to_vb_xa() which hashes any address
> + * to a specific index(in a hash) it belongs to. This then uses a
> + * per_cpu() macro to access an array with generated index.
> + *
> + * An example:
> + *
> + *  CPU_1  CPU_2  CPU_0
> + *    |      |      |
> + *    V      V      V
> + * 0     10     20     30     40     50     60
> + * |------|------|------|------|------|------|...<vmap address space>
> + *   CPU0  CPU1   CPU2  CPU0  CPU1   CPU2
> + *
> + * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
> + *   it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
> + *
> + * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
> + *   it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
> + *
> + * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
> + *   it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
> + *
> + * This technique almost always avoids lock contention on insert/remove,
> + * however xarray spinlocks protect against any contention that remains.
> + */
> +static struct xarray *
> +addr_to_vb_xa(unsigned long addr)
> +{
> +	int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();
> +
> +	return &per_cpu(vmap_block_queue, index).vmap_blocks;
> +}
> +
> +/*
> + * We should probably have a fallback mechanism to allocate virtual memory
> + * out of partially filled vmap blocks. However vmap block sizing should be
> + * fairly reasonable according to the vmalloc size, so it shouldn't be a
> + * big problem.
> + */
> +
> +static unsigned long addr_to_vb_idx(unsigned long addr)
> +{
> +	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
> +	addr /= VMAP_BLOCK_SIZE;
> +	return addr;
> +}
> +
> +static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
> +{
> +	unsigned long addr;
> +
> +	addr = va_start + (pages_off << PAGE_SHIFT);
> +	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
> +	return (void *)addr;
> +}
> +
>  static __always_inline unsigned long
>  va_size(struct vmap_area *va)
>  {
> @@ -2327,137 +2458,6 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
>  
>  /*** Per cpu kva allocator ***/
>  
> -/*
> - * vmap space is limited especially on 32 bit architectures. Ensure there is
> - * room for at least 16 percpu vmap blocks per CPU.
> - */
> -/*
> - * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
> - * to #define VMALLOC_SPACE	(VMALLOC_END-VMALLOC_START). Guess
> - * instead (we just need a rough idea)
> - */
> -#if BITS_PER_LONG == 32
> -#define VMALLOC_SPACE		(128UL*1024*1024)
> -#else
> -#define VMALLOC_SPACE		(128UL*1024*1024*1024)
> -#endif
> -
> -#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
> -#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
> -#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
> -#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
> -#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
> -#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
> -#define VMAP_BBMAP_BITS		\
> -		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
> -		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
> -			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
> -
> -#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
> -
> -/*
> - * Purge threshold to prevent overeager purging of fragmented blocks for
> - * regular operations: Purge if vb->free is less than 1/4 of the capacity.
> - */
> -#define VMAP_PURGE_THRESHOLD	(VMAP_BBMAP_BITS / 4)
> -
> -#define VMAP_RAM		0x1 /* indicates vm_map_ram area*/
> -#define VMAP_BLOCK		0x2 /* mark out the vmap_block sub-type*/
> -#define VMAP_FLAGS_MASK		0x3
> -
> -struct vmap_block_queue {
> -	spinlock_t lock;
> -	struct list_head free;
> -
> -	/*
> -	 * An xarray requires an extra memory dynamically to
> -	 * be allocated. If it is an issue, we can use rb-tree
> -	 * instead.
> -	 */
> -	struct xarray vmap_blocks;
> -};
> -
> -struct vmap_block {
> -	spinlock_t lock;
> -	struct vmap_area *va;
> -	unsigned long free, dirty;
> -	DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
> -	unsigned long dirty_min, dirty_max; /*< dirty range */
> -	struct list_head free_list;
> -	struct rcu_head rcu_head;
> -	struct list_head purge;
> -};
> -
> -/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
> -static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
> -
> -/*
> - * In order to fast access to any "vmap_block" associated with a
> - * specific address, we use a hash.
> - *
> - * A per-cpu vmap_block_queue is used in both ways, to serialize
> - * an access to free block chains among CPUs(alloc path) and it
> - * also acts as a vmap_block hash(alloc/free paths). It means we
> - * overload it, since we already have the per-cpu array which is
> - * used as a hash table. When used as a hash a 'cpu' passed to
> - * per_cpu() is not actually a CPU but rather a hash index.
> - *
> - * A hash function is addr_to_vb_xa() which hashes any address
> - * to a specific index(in a hash) it belongs to. This then uses a
> - * per_cpu() macro to access an array with generated index.
> - *
> - * An example:
> - *
> - *  CPU_1  CPU_2  CPU_0
> - *    |      |      |
> - *    V      V      V
> - * 0     10     20     30     40     50     60
> - * |------|------|------|------|------|------|...<vmap address space>
> - *   CPU0  CPU1   CPU2  CPU0  CPU1   CPU2
> - *
> - * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
> - *   it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
> - *
> - * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
> - *   it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
> - *
> - * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
> - *   it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
> - *
> - * This technique almost always avoids lock contention on insert/remove,
> - * however xarray spinlocks protect against any contention that remains.
> - */
> -static struct xarray *
> -addr_to_vb_xa(unsigned long addr)
> -{
> -	int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();
> -
> -	return &per_cpu(vmap_block_queue, index).vmap_blocks;
> -}
> -
> -/*
> - * We should probably have a fallback mechanism to allocate virtual memory
> - * out of partially filled vmap blocks. However vmap block sizing should be
> - * fairly reasonable according to the vmalloc size, so it shouldn't be a
> - * big problem.
> - */
> -
> -static unsigned long addr_to_vb_idx(unsigned long addr)
> -{
> -	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
> -	addr /= VMAP_BLOCK_SIZE;
> -	return addr;
> -}
> -
> -static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
> -{
> -	unsigned long addr;
> -
> -	addr = va_start + (pages_off << PAGE_SHIFT);
> -	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
> -	return (void *)addr;
> -}
> -
>  /**
>   * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
>   *	block. Of course pages number can't exceed VMAP_BBMAP_BITS
> -- 
> 2.43.0
> 

Sorry for the late answer, I also just noticed this email; it was not in my
inbox...

OK, so now you move part of the per-cpu allocator to the top and leave the
other part below, splitting it. This is just for the VMAP_RAM macro used in:

	BUG_ON(va_flags & VMAP_RAM);

Do we really need this BUG_ON()?

--
Uladzislau Rezki
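
As an aside for readers of the hashing comment quoted above: the index
computation in addr_to_vb_xa() is easy to reproduce in userspace. The sketch
below is illustrative only; DEMO_BLOCK_SIZE, DEMO_NR_CPUS and
demo_addr_to_index() are made-up stand-ins for VMAP_BLOCK_SIZE,
num_possible_cpus() and the real helper, with values chosen to match the
ASCII diagram in the comment rather than real kernel constants.

#include <stdio.h>

/* Stand-ins sized to the example diagram, not the kernel values. */
#define DEMO_BLOCK_SIZE	10	/* plays the role of VMAP_BLOCK_SIZE */
#define DEMO_NR_CPUS	3	/* plays the role of num_possible_cpus() */

/* Same arithmetic as addr_to_vb_xa(): block number modulo CPU count. */
static int demo_addr_to_index(unsigned long addr)
{
	return (addr / DEMO_BLOCK_SIZE) % DEMO_NR_CPUS;
}

int main(void)
{
	unsigned long addrs[] = { 6, 11, 20 };
	int i;

	/*
	 * Prints indexes 0, 1 and 2, matching the comment's example where
	 * vm_unmap_ram(6), vm_unmap_ram(11) and vm_unmap_ram(20) land in
	 * the CPU0, CPU1 and CPU2 zones respectively.
	 */
	for (i = 0; i < 3; i++)
		printf("addr %lu -> per-cpu hash index %d\n",
		       addrs[i], demo_addr_to_index(addrs[i]));

	return 0;
}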