Re: [PATCH] MM: Support more pagesizes for MAP_HUGETLB/SHM_HUGETLB v2

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue 12-06-12 16:13:36, Andi Kleen wrote:
> From: Andi Kleen <ak@xxxxxxxxxxxxxxx>
> 
> There was some desire in large applications using MAP_HUGETLB/SHM_HUGETLB
> to use 1GB huge pages on some mappings, and stay with 2MB on others. This
> is useful together with NUMA policy: use 2MB interleaving on some mappings,
> but 1GB on local mappings.
> 
> This patch extends the IPC/SHM syscall interfaces slightly to allow specifying
> the page size.
> 
> It borrows some upper bits in the existing flag arguments and allows encoding
> the log of the desired page size in addition to the *_HUGETLB flag.
> When 0 is specified the default size is used, this makes the change fully
> compatible.
> 
> Extending the internal hugetlb code to handle this is straight forward. Instead
> of a single mount it just keeps an array of them and selects the right
> mount based on the specified page size.
> 
> I also exported the new flags to the user headers
> (they were previously under __KERNEL__). Right now only symbols
> for x86 and some other architecture for 1GB and 2MB are defined.
> The interface should already work for all other architectures
> though.

I like the idea!

Please note that some parts of the patch clash with the hugetlb cgroup
controller [1] (CCing Aneesh), which made it into the -mm tree recently.
Nothing big though.

---
[1] http://thread.gmane.org/gmane.linux.kernel.cgroups/2637 -- there is
also a -v9 out there, but Google doesn't seem to show it to me (that one is
just a bugfix release).

> 
> v2: Port to new tree. Fix unmount.
> Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
> ---
>  arch/x86/include/asm/mman.h |    3 ++
>  fs/hugetlbfs/inode.c        |   62 ++++++++++++++++++++++++++++++++++---------
>  include/asm-generic/mman.h  |   13 +++++++++
>  include/linux/hugetlb.h     |   17 ++++++++++-
>  include/linux/shm.h         |   19 +++++++++++++
>  ipc/shm.c                   |    3 +-
>  mm/hugetlb.c                |    5 +---
>  mm/mmap.c                   |    5 ++-
>  8 files changed, 105 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
> index 593e51d..513b05f 100644
> --- a/arch/x86/include/asm/mman.h
> +++ b/arch/x86/include/asm/mman.h
> @@ -3,6 +3,9 @@
>  
>  #define MAP_32BIT	0x40		/* only give out 32bit addresses */
>  
> +#define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
> +#define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
> +
>  #include <asm-generic/mman.h>
>  
>  #endif /* _ASM_X86_MMAN_H */
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index cc9281b..b5b6a1d 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -920,16 +920,29 @@ static struct file_system_type hugetlbfs_fs_type = {
>  	.kill_sb	= kill_litter_super,
>  };
>  
> -static struct vfsmount *hugetlbfs_vfsmount;
> +static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
>  
>  static int can_do_hugetlb_shm(void)
>  {
>  	return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
>  }
>  
> +static int get_hstate_idx(int page_size_log)
> +{
> +	struct hstate *h;
> +
> +	if (!page_size_log)
> +		return default_hstate_idx;
> +	h = size_to_hstate(1 << page_size_log);
> +	if (!h)
> +		return -1;
> +	return h - hstates;
> +}
> +
>  struct file *hugetlb_file_setup(const char *name, unsigned long addr,
>  				size_t size, vm_flags_t acctflag,
> -				struct user_struct **user, int creat_flags)
> +				struct user_struct **user,
> +				int creat_flags, int page_size_log)
>  {
>  	int error = -ENOMEM;
>  	struct file *file;
> @@ -939,9 +952,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
>  	struct qstr quick_string;
>  	struct hstate *hstate;
>  	unsigned long num_pages;
> +	int hstate_idx;
> +
> +	hstate_idx = get_hstate_idx(page_size_log);
> +	if (hstate_idx < 0)
> +		return ERR_PTR(-ENODEV);
>  
>  	*user = NULL;
> -	if (!hugetlbfs_vfsmount)
> +	if (!hugetlbfs_vfsmount[hstate_idx])
>  		return ERR_PTR(-ENOENT);
>  
>  	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
> @@ -958,7 +976,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
>  		}
>  	}
>  
> -	root = hugetlbfs_vfsmount->mnt_root;
> +	root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
>  	quick_string.name = name;
>  	quick_string.len = strlen(quick_string.name);
>  	quick_string.hash = 0;
> @@ -966,7 +984,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
>  	if (!path.dentry)
>  		goto out_shm_unlock;
>  
> -	path.mnt = mntget(hugetlbfs_vfsmount);
> +	path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
>  	error = -ENOSPC;
>  	inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
>  	if (!inode)
> @@ -1006,8 +1024,9 @@ out_shm_unlock:
>  
>  static int __init init_hugetlbfs_fs(void)
>  {
> +	struct hstate *h;
>  	int error;
> -	struct vfsmount *vfsmount;
> +	int i;
>  
>  	error = bdi_init(&hugetlbfs_backing_dev_info);
>  	if (error)
> @@ -1024,14 +1043,26 @@ static int __init init_hugetlbfs_fs(void)
>  	if (error)
>  		goto out;
>  
> -	vfsmount = kern_mount(&hugetlbfs_fs_type);
> +	i = 0;
> +	for_each_hstate (h) {
> +		char buf[50];
> +		unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
>  
> -	if (!IS_ERR(vfsmount)) {
> -		hugetlbfs_vfsmount = vfsmount;
> -		return 0;
> -	}
> +		snprintf(buf, sizeof buf, "pagesize=%uK", ps_kb);
> +		hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
> +							buf);
>  
> -	error = PTR_ERR(vfsmount);
> +		if (IS_ERR(hugetlbfs_vfsmount[i])) {
> +				pr_err(
> +			"hugetlb: Cannot mount internal hugetlbfs for page size %uK",
> +			       ps_kb);
> +			error = PTR_ERR(hugetlbfs_vfsmount[i]);
> +		}
> +		i++;
> +	}
> +	/* Non default hstates are optional */
> +	if (hugetlbfs_vfsmount[default_hstate_idx])
> +		return 0;
>  
>   out:
>  	kmem_cache_destroy(hugetlbfs_inode_cachep);
> @@ -1042,8 +1073,13 @@ static int __init init_hugetlbfs_fs(void)
>  
>  static void __exit exit_hugetlbfs_fs(void)
>  {
> +	struct hstate *h;
> +	int i;
> +
>  	kmem_cache_destroy(hugetlbfs_inode_cachep);
> -	kern_unmount(hugetlbfs_vfsmount);
> +	i = 0;
> +	for_each_hstate (h)
> +		kern_unmount(hugetlbfs_vfsmount[i++]);
>  	unregister_filesystem(&hugetlbfs_fs_type);
>  	bdi_destroy(&hugetlbfs_backing_dev_info);
>  }
> diff --git a/include/asm-generic/mman.h b/include/asm-generic/mman.h
> index 32c8bd6..d2f35d8 100644
> --- a/include/asm-generic/mman.h
> +++ b/include/asm-generic/mman.h
> @@ -13,6 +13,19 @@
>  #define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
>  #define MAP_HUGETLB	0x40000		/* create a huge page mapping */
>  
> +/* Bits [26:31] are reserved */
> +
> +/*
> + * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
> + * This gives us 6 bits, which is enough until someone invents 128 bit address
> + * spaces.
> + *
> + * Assume these are all power of twos.
> + * When 0 use the default page size.
> + */
> +#define MAP_HUGE_SHIFT  26
> +#define MAP_HUGE_MASK   0x3f
> +
>  #define MCL_CURRENT	1		/* lock all current mappings */
>  #define MCL_FUTURE	2		/* lock all future mappings */
>  
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index d5d6bbe..78618a3 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -154,7 +154,13 @@ extern const struct file_operations hugetlbfs_file_operations;
>  extern const struct vm_operations_struct hugetlb_vm_ops;
>  struct file *hugetlb_file_setup(const char *name, unsigned long addr,
>  				size_t size, vm_flags_t acct,
> -				struct user_struct **user, int creat_flags);
> +				struct user_struct **user, int creat_flags,
> +				int page_size_log);
> +int hugetlb_get_quota(struct address_space *mapping, long delta);
> +void hugetlb_put_quota(struct address_space *mapping, long delta);
> +
> +int hugetlb_get_quota(struct address_space *mapping, long delta);
> +void hugetlb_put_quota(struct address_space *mapping, long delta);
>  
>  static inline int is_file_hugepages(struct file *file)
>  {
> @@ -166,12 +172,19 @@ static inline int is_file_hugepages(struct file *file)
>  	return 0;
>  }
>  
> +
> +extern int max_hstate;
> +
> +#define for_each_hstate(h) \
> +	for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
> +
>  #else /* !CONFIG_HUGETLBFS */
>  
>  #define is_file_hugepages(file)			0
>  static inline struct file *
>  hugetlb_file_setup(const char *name, unsigned long addr, size_t size,
> -		vm_flags_t acctflag, struct user_struct **user, int creat_flags)
> +		vm_flags_t acctflag, struct user_struct **user, int creat_flags,
> +		int page_size_log)
>  {
>  	return ERR_PTR(-ENOSYS);
>  }
> diff --git a/include/linux/shm.h b/include/linux/shm.h
> index 92808b8..41aa305 100644
> --- a/include/linux/shm.h
> +++ b/include/linux/shm.h
> @@ -100,12 +100,31 @@ struct shmid_kernel /* private to the kernel */
>  	struct task_struct	*shm_creator;
>  };
>  
> +#endif
> +
>  /* shm_mode upper byte flags */
>  #define	SHM_DEST	01000	/* segment will be destroyed on last detach */
>  #define SHM_LOCKED      02000   /* segment will not be swapped */
>  #define SHM_HUGETLB     04000   /* segment will use huge TLB pages */
>  #define SHM_NORESERVE   010000  /* don't check for reservations */
>  
> +/* Bits [26:31] are reserved */
> +
> +/*
> + * When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
> + * This gives us 6 bits, which is enough until someone invents 128 bit address
> + * spaces.
> + *
> + * Assume these are all power of twos.
> + * When 0 use the default page size.
> + */
> +#define SHM_HUGE_SHIFT  26
> +#define SHM_HUGE_MASK   0x3f
> +#define SHM_HUGE_2MB    (21 << SHM_HUGE_SHIFT)
> +#define SHM_HUGE_1GB    (30 << SHM_HUGE_SHIFT)
> +
> +#ifdef __KERNEL__
> +
>  #ifdef CONFIG_SYSVIPC
>  long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr);
>  extern int is_file_shm_hugepages(struct file *file);
> diff --git a/ipc/shm.c b/ipc/shm.c
> index 5e2cbfd..2515004 100644
> --- a/ipc/shm.c
> +++ b/ipc/shm.c
> @@ -483,7 +483,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
>  		if (shmflg & SHM_NORESERVE)
>  			acctflag = VM_NORESERVE;
>  		file = hugetlb_file_setup(name, 0, size, acctflag,
> -					&shp->mlock_user, HUGETLB_SHMFS_INODE);
> +				  &shp->mlock_user, HUGETLB_SHMFS_INODE,
> +				(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
>  	} else {
>  		/*
>  		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index e198831..bcae924 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -34,7 +34,7 @@ const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
>  static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
>  unsigned long hugepages_treat_as_movable;
>  
> -static int max_hstate;
> +int max_hstate;
>  unsigned int default_hstate_idx;
>  struct hstate hstates[HUGE_MAX_HSTATE];
>  
> @@ -45,9 +45,6 @@ static struct hstate * __initdata parsed_hstate;
>  static unsigned long __initdata default_hstate_max_huge_pages;
>  static unsigned long __initdata default_hstate_size;
>  
> -#define for_each_hstate(h) \
> -	for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
> -
>  /*
>   * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
>   */
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 3edfcdf..cc417ee 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1125,8 +1125,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
>  		 * memory so no accounting is necessary
>  		 */
>  		file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
> -						VM_NORESERVE, &user,
> -						HUGETLB_ANONHUGE_INODE);
> +				VM_NORESERVE,
> +				&user, HUGETLB_ANONHUGE_INODE,
> +				(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
>  		if (IS_ERR(file))
>  			return PTR_ERR(file);
>  	}
> -- 
> 1.7.7.6
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>

-- 
Michal Hocko
SUSE Labs
SUSE LINUX s.r.o.
Lihovarska 1060/12
190 00 Praha 9    
Czech Republic

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>


[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]