The pid_revalidate() function requires dropping from RCU into REF lookup mode. When many threads are resolving paths within /proc in parallel, this can result in heavy spinlock contention as each thread tries to grab a reference to the /proc dentry (and drop it shortly thereafter). Allow the pid_revalidate() function to execute under LOOKUP_RCU. When updates must be made to the inode due to the owning task performing setuid(), drop out of RCU and into REF mode. Signed-off-by: Stephen Brennan <stephen.s.brennan@xxxxxxxxxx> --- I'd like to use this patch as an RFC on this approach for reducing spinlock contention during many parallel path lookups in the /proc filesystem. The contention can be triggered by, for example, running ~100 parallel instances of "TZ=/etc/localtime ps -fe >/dev/null" on a 100CPU machine. The %sys utilization in such a case reaches around 90%, and profiles show two code paths with high utilization: walk_component lookup_fast unlazy_child legitimize_root __legitimize_path lockref_get_not_dead terminate_walk dput dput By applying this patch, %sys utilization falls to around 60% under the same workload. One item I'd like to highlight about this patch is that the security_task_to_inode() hook is called less frequently as a result. I don't know whether this is a major concern, which is why I've included security reviewers as well. fs/proc/base.c | 50 ++++++++++++++++++++++++++++++++------------- fs/proc/internal.h | 5 +++++ include/linux/pid.h | 2 ++ kernel/pid.c | 12 +++++++++++ 4 files changed, 55 insertions(+), 14 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index ebea9501afb8..038056f94ed0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1813,12 +1813,29 @@ int pid_getattr(const struct path *path, struct kstat *stat, /* * Set <pid>/... inode ownership (can change due to setuid(), etc.) */ -void pid_update_inode(struct task_struct *task, struct inode *inode) +static int do_pid_update_inode(struct task_struct *task, struct inode *inode, + unsigned int flags) { - task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); + kuid_t uid; + kgid_t gid; + + task_dump_owner(task, inode->i_mode, &uid, &gid); + if (uid_eq(uid, inode->i_uid) && gid_eq(gid, inode->i_gid) && + !(inode->i_mode & (S_ISUID | S_ISGID))) + return 1; + if (flags & LOOKUP_RCU) + return -ECHILD; + inode->i_uid = uid; + inode->i_gid = gid; inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); + return 1; +} + +void pid_update_inode(struct task_struct *task, struct inode *inode) +{ + do_pid_update_inode(task, inode, 0); } /* @@ -1830,19 +1847,24 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - inode = d_inode(dentry); - task = get_proc_task(inode); - - if (task) { - pid_update_inode(task, inode); - put_task_struct(task); - return 1; + int rv = 0; + + if (flags & LOOKUP_RCU) { + inode = d_inode_rcu(dentry); + task = get_proc_task_rcu(inode); + if (task) { + rv = do_pid_update_inode(task, inode, flags); + put_task_struct_rcu_user(task); + } + } else { + inode = d_inode(dentry); + task = get_proc_task(inode); + if (task) { + rv = do_pid_update_inode(task, inode, flags); + put_task_struct(task); + } } - return 0; + return rv; } static inline bool proc_inode_is_dead(struct inode *inode) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index cd0c8d5ce9a1..aa6df65ad3eb 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -121,6 +121,11 @@ static inline struct task_struct *get_proc_task(const struct inode *inode) return get_pid_task(proc_pid(inode), PIDTYPE_PID); } +static inline struct task_struct *get_proc_task_rcu(const struct inode *inode) +{ + return get_pid_task_rcu_user(proc_pid(inode), PIDTYPE_PID); +} + void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid); diff --git a/include/linux/pid.h b/include/linux/pid.h index 9645b1194c98..0b2c54f85e6d 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -86,6 +86,8 @@ static inline struct pid *get_pid(struct pid *pid) extern void put_pid(struct pid *pid); extern struct task_struct *pid_task(struct pid *pid, enum pid_type); extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type); +extern struct task_struct *get_pid_task_rcu_user(struct pid *pid, + enum pid_type type); extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); diff --git a/kernel/pid.c b/kernel/pid.c index 0a9f2e437217..05acbd15cfa6 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -390,6 +390,18 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) } EXPORT_SYMBOL_GPL(get_pid_task); +struct task_struct *get_pid_task_rcu_user(struct pid *pid, enum pid_type type) +{ + struct task_struct *task; + + task = pid_task(pid, type); + if (task && refcount_inc_not_zero(&task->rcu_users)) + return task; + + return NULL; +} +EXPORT_SYMBOL_GPL(get_pid_task_rcu_user); + struct pid *find_get_pid(pid_t nr) { struct pid *pid; -- 2.25.1