There are several storage drivers like dm-multipath, iscsi, tcmu-runner, and nbd that have userspace components that can run in the IO path. For example, iscsi and nbd's userspace daemons may need to recreate a socket and/or send IO on it, and dm-multipath's daemon multipathd may need to send IO to figure out the state of paths and re-set them up. In the kernel these drivers have access to GFP_NOIO/GFP_NOFS and the memalloc_*_save/restore functions to control the allocation behavior, but for userspace we would end up hitting an allocation that ended up writing data back to the same device we are trying to allocate for. This patch allows the userspace daemons to set the PF_MEMALLOC* flags with prctl during their initialization so later allocations cannot call back into them. Signed-off-by: Mike Christie <mchristi@xxxxxxxxxx> --- V2: - Use prctl instead of procfs. - Add support for NOFS for fuse. - Check permissions. include/uapi/linux/prctl.h | 8 +++++++ kernel/sys.c | 44 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 7da1b37b27aa..6f6b3af6633a 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -234,4 +234,12 @@ struct prctl_mm_map { #define PR_GET_TAGGED_ADDR_CTRL 56 # define PR_TAGGED_ADDR_ENABLE (1UL << 0) +/* Control reclaim behavior when allocating memory */ +#define PR_SET_MEMALLOC 57 +#define PR_GET_MEMALLOC 58 +#define PR_MEMALLOC_SET_NOIO (1UL << 0) +#define PR_MEMALLOC_CLEAR_NOIO (1UL << 1) +#define PR_MEMALLOC_SET_NOFS (1UL << 2) +#define PR_MEMALLOC_CLEAR_NOFS (1UL << 3) + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index a611d1d58c7d..34fedc9fc7e4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2486,6 +2486,50 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = GET_TAGGED_ADDR_CTRL(); break; + case PR_SET_MEMALLOC: + if (!capable(CAP_SYS_ADMIN)) + 
return -EPERM; + + if (arg3 || arg4 || arg5) + return -EINVAL; + + switch (arg2) { + case PR_MEMALLOC_SET_NOIO: + if (current->flags & PF_MEMALLOC_NOFS) + return -EINVAL; + + current->flags |= PF_MEMALLOC_NOIO; + break; + case PR_MEMALLOC_CLEAR_NOIO: + current->flags &= ~PF_MEMALLOC_NOIO; + break; + case PR_MEMALLOC_SET_NOFS: + if (current->flags & PF_MEMALLOC_NOIO) + return -EINVAL; + + current->flags |= PF_MEMALLOC_NOFS; + break; + case PR_MEMALLOC_CLEAR_NOFS: + current->flags &= ~PF_MEMALLOC_NOFS; + break; + default: + return -EINVAL; + } + break; + case PR_GET_MEMALLOC: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + if (current->flags & PF_MEMALLOC_NOIO) + error = PR_MEMALLOC_SET_NOIO; + else if (current->flags & PF_MEMALLOC_NOFS) + error = PR_MEMALLOC_SET_NOFS; + else + error = 0; + break; default: error = -EINVAL; break; -- 2.20.1