The new fbind syscall sets the NUMA memory policy for file-backed memory and has the following signature: long fbind(unsigned int fd, unsigned long mode, const unsigned long nodemask[(.maxnode + ULONG_WIDTH - 1) / ULONG_WIDTH], unsigned long maxnode, unsigned int flags); fbind() behaves similarly to mbind(), except that it takes a file descriptor as input instead of an address range. TODO: 1. Support the fbind syscall on all architectures. 2. Expand the commit message and add documentation. 3. Clean up the code. [Shivansh: add create_mpol_from_args()] Signed-off-by: Shivansh Dhiman <shivansh.dhiman@xxxxxxx> Signed-off-by: Shivank Garg <shivankg@xxxxxxx> --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/fs.h | 3 ++ include/linux/mempolicy.h | 3 ++ include/linux/syscalls.h | 3 ++ include/uapi/asm-generic/unistd.h | 5 ++- kernel/sys_ni.c | 1 + mm/Makefile | 2 +- mm/fbind.c | 49 +++++++++++++++++++++++ mm/mempolicy.c | 55 ++++++++++++++++++++++++++ 10 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 mm/fbind.c diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 534c74b14fab..0660ce6d08d8 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -468,3 +468,4 @@ 460 i386 lsm_set_self_attr sys_lsm_set_self_attr 461 i386 lsm_list_modules sys_lsm_list_modules 462 i386 mseal sys_mseal +463 i386 fbind sys_fbind diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 7093ee21c0d1..9794347cc2e6 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -386,6 +386,7 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal +463 common fbind sys_fbind # # Due to a historical design error, certain syscalls are numbered differently diff --git a/include/linux/fs.h b/include/linux/fs.h index
fd34b5755c0b..42042b62bdcd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2058,6 +2058,9 @@ struct file_operations { struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); +#ifdef CONFIG_NUMA + int (*set_policy)(struct file *, struct mempolicy *); +#endif int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 1add16f21612..b9023f6246a7 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -299,4 +299,7 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol) } #endif /* CONFIG_NUMA */ +struct mempolicy *create_mpol_from_args(unsigned char mode, + const unsigned long __user *nmask, + unsigned short maxnode); #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 4bcf6754738d..2dc686921b9f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -502,6 +502,9 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *bu asmlinkage long sys_newfstatat(int dfd, const char __user *filename, struct stat __user *statbuf, int flag); asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf); +asmlinkage long sys_fbind(unsigned int fd, unsigned long mode, + const unsigned long __user *nmask, + unsigned long maxnode, unsigned int flags); #if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf); asmlinkage long sys_fstatat64(int dfd, const char __user *filename, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 5bf6148cac2b..550730f36dae 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -841,8 +841,11 @@ 
__SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) #define __NR_mseal 462 __SYSCALL(__NR_mseal, sys_mseal) +#define __NR_fbind 463 +__SYSCALL(__NR_fbind, sys_fbind) + #undef __NR_syscalls -#define __NR_syscalls 463 +#define __NR_syscalls 464 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c00a86931f8c..f57350e581f6 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -195,6 +195,7 @@ COND_SYSCALL(move_pages); COND_SYSCALL(set_mempolicy_home_node); COND_SYSCALL(cachestat); COND_SYSCALL(mseal); +COND_SYSCALL(fbind); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/mm/Makefile b/mm/Makefile index d2915f8c9dc0..ba339ddc0be2 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -79,7 +79,7 @@ obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o -obj-$(CONFIG_NUMA) += mempolicy.o +obj-$(CONFIG_NUMA) += mempolicy.o fbind.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o diff --git a/mm/fbind.c b/mm/fbind.c new file mode 100644 index 000000000000..85ec7d13345c --- /dev/null +++ b/mm/fbind.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Implement fbind() syscall. 
+ * + * Copyright (c) 2024 AMD + * + * Author: Shivank Garg <shivankg@xxxxxxx> + */ + +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/mempolicy.h> +#include <linux/syscalls.h> + +static long do_fbind(unsigned int fd, unsigned long mode, + const unsigned long __user *nmask, + unsigned long maxnode, unsigned int flags) +{ + struct mempolicy *mpol; + struct fd f; + int ret; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + mpol = create_mpol_from_args(mode, nmask, maxnode); + if (IS_ERR_OR_NULL(mpol)) { + ret = PTR_ERR(mpol); + goto out_putf; + } + + if (f.file->f_op->set_policy) + ret = f.file->f_op->set_policy(f.file, mpol); + else + ret = -EOPNOTSUPP; + + mpol_put(mpol); +out_putf: + fdput(f); + return ret; +} + +SYSCALL_DEFINE5(fbind, unsigned int, fd, unsigned long, mode, + const unsigned long __user *, nmask, + unsigned long, maxnode, unsigned int, flags) +{ + return do_fbind(fd, mode, nmask, maxnode, flags); +} diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b858e22b259d..3a697080ecad 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -3557,3 +3557,58 @@ static int __init mempolicy_sysfs_init(void) late_initcall(mempolicy_sysfs_init); #endif /* CONFIG_SYSFS */ + +/** + * create_mpol_from_args - create a mempolicy structure from args + * @mode: NUMA memory policy mode + * @nmask: bitmask of NUMA nodes + * @maxnode: number of bits in the nodes bitmask + * + * Create a mempolicy from given nodemask and memory policy such as + * default, preferred, interleave or bind. + * + * Return: error encoded in a pointer or memory policy on success. 
+ * May also be NULL when mpol_new() creates no policy object (as it does
+ * for MPOL_DEFAULT), so callers should test the result with
+ * IS_ERR_OR_NULL().  A non-NULL, non-error result carries a reference that
+ * the caller must drop with mpol_put().
+ */
+struct mempolicy *create_mpol_from_args(unsigned char mode,
+					const unsigned long __user *nmask,
+					unsigned short maxnode)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned short mode_flags;
+	struct mempolicy *mpol;
+	/* Declared up front, like the other NODEMASK_SCRATCH() users here. */
+	NODEMASK_SCRATCH(scratch);
+	nodemask_t nodes;
+	int lmode = mode;
+	int err;
+
+	if (!scratch)
+		return ERR_PTR(-ENOMEM);
+
+	/* Split @mode into the bare mode (lmode) and its mode flags. */
+	err = sanitize_mpol_flags(&lmode, &mode_flags);
+	if (!err)
+		err = get_nodes(&nodes, nmask, maxnode);
+	if (err) {
+		mpol = ERR_PTR(err);
+		goto out;
+	}
+
+	/*
+	 * Pass the sanitized mode, not the raw argument: lmode is the value
+	 * sanitize_mpol_flags() validated and stripped of flag bits.
+	 */
+	mpol = mpol_new(lmode, mode_flags, &nodes);
+	if (IS_ERR_OR_NULL(mpol))
+		goto out;
+
+	/*
+	 * NOTE(review): the mmap write lock is held around
+	 * mpol_set_nodemask(), presumably mirroring do_mbind() - confirm
+	 * this is the right lock for a policy not attached to any VMA.
+	 */
+	mmap_write_lock(mm);
+	err = mpol_set_nodemask(mpol, &nodes, scratch);
+	mmap_write_unlock(mm);
+	if (err) {
+		mpol_put(mpol);
+		mpol = ERR_PTR(err);
+	}
+
+out:
+	/* Single exit so the scratch nodemask is released on every path. */
+	NODEMASK_SCRATCH_FREE(scratch);
+	return mpol;
+}
--
2.34.1