Stephen Brennan <stephen.s.brennan@xxxxxxxxxx> writes: > The pid_revalidate() function requires dropping from RCU into REF lookup > mode. When many threads are resolving paths within /proc in parallel, > this can result in heavy spinlock contention as each thread tries to > grab a reference to the /proc dentry (and drop it shortly thereafter). > > Allow the pid_revalidate() function to execute under LOOKUP_RCU. When > updates must be made to the inode due to the owning task performing > setuid(), drop out of RCU and into REF mode. So rather than get_task_rcu_user. I think what we want is a function that verifies task->rcu_users > 0. Which frankly is just "pid_task(proc_pid(inode), PIDTYPE_PID)". Which is something that we can do unconditionally in pid_revalidate. Skipping the update of the inode is probably the only thing that needs to be skipped. It looks like the code can safely rely on the the security_task_to_inode in proc_pid_make_inode and remove the security_task_to_inode in pid_update_inode. > Signed-off-by: Stephen Brennan <stephen.s.brennan@xxxxxxxxxx> > --- > > I'd like to use this patch as an RFC on this approach for reducing spinlock > contention during many parallel path lookups in the /proc filesystem. The > contention can be triggered by, for example, running ~100 parallel instances of > "TZ=/etc/localtime ps -fe >/dev/null" on a 100CPU machine. The %sys utilization > in such a case reaches around 90%, and profiles show two code paths with high > utilization: Do you have a real world work-load that is behaves something like this micro benchmark? I am just curious how severe the problem you are trying to solve is. > > walk_component > lookup_fast > unlazy_child > legitimize_root > __legitimize_path > lockref_get_not_dead > > terminate_walk > dput > dput > > By applying this patch, %sys utilization falls to around 60% under the same > workload. > > One item I'd like to highlight about this patch is that the > security_task_to_inode() hook is called less frequently as a result. I don't > know whether this is a major concern, which is why I've included security > reviewers as well. > > fs/proc/base.c | 50 ++++++++++++++++++++++++++++++++------------- > fs/proc/internal.h | 5 +++++ > include/linux/pid.h | 2 ++ > kernel/pid.c | 12 +++++++++++ > 4 files changed, 55 insertions(+), 14 deletions(-) > > diff --git a/fs/proc/base.c b/fs/proc/base.c > index ebea9501afb8..038056f94ed0 100644 > --- a/fs/proc/base.c > +++ b/fs/proc/base.c > @@ -1813,12 +1813,29 @@ int pid_getattr(const struct path *path, struct kstat *stat, > /* > * Set <pid>/... inode ownership (can change due to setuid(), etc.) > */ > -void pid_update_inode(struct task_struct *task, struct inode *inode) > +static int do_pid_update_inode(struct task_struct *task, struct inode *inode, > + unsigned int flags) > { > - task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); > + kuid_t uid; > + kgid_t gid; > + > + task_dump_owner(task, inode->i_mode, &uid, &gid); > + if (uid_eq(uid, inode->i_uid) && gid_eq(gid, inode->i_gid) && > + !(inode->i_mode & (S_ISUID | S_ISGID))) > + return 1; > + if (flags & LOOKUP_RCU) > + return -ECHILD; > > + inode->i_uid = uid; > + inode->i_gid = gid; > inode->i_mode &= ~(S_ISUID | S_ISGID); > security_task_to_inode(task, inode); > + return 1; > +} > + > +void pid_update_inode(struct task_struct *task, struct inode *inode) > +{ > + do_pid_update_inode(task, inode, 0); > } > > /* > @@ -1830,19 +1847,24 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags) > { > struct inode *inode; > struct task_struct *task; > - > - if (flags & LOOKUP_RCU) > - return -ECHILD; > - > - inode = d_inode(dentry); > - task = get_proc_task(inode); > - > - if (task) { > - pid_update_inode(task, inode); > - put_task_struct(task); > - return 1; > + int rv = 0; > + > + if (flags & LOOKUP_RCU) { > + inode = d_inode_rcu(dentry); > + task = get_proc_task_rcu(inode); > + if (task) { > + rv = do_pid_update_inode(task, inode, flags); > + put_task_struct_rcu_user(task); > + } > + } else { > + inode = d_inode(dentry); > + task = get_proc_task(inode); > + if (task) { > + rv = do_pid_update_inode(task, inode, flags); > + put_task_struct(task); > + } > } > - return 0; > + return rv; > } > > static inline bool proc_inode_is_dead(struct inode *inode) > diff --git a/fs/proc/internal.h b/fs/proc/internal.h > index cd0c8d5ce9a1..aa6df65ad3eb 100644 > --- a/fs/proc/internal.h > +++ b/fs/proc/internal.h > @@ -121,6 +121,11 @@ static inline struct task_struct *get_proc_task(const struct inode *inode) > return get_pid_task(proc_pid(inode), PIDTYPE_PID); > } > > +static inline struct task_struct *get_proc_task_rcu(const struct inode *inode) > +{ > + return get_pid_task_rcu_user(proc_pid(inode), PIDTYPE_PID); > +} > + > void task_dump_owner(struct task_struct *task, umode_t mode, > kuid_t *ruid, kgid_t *rgid); > > diff --git a/include/linux/pid.h b/include/linux/pid.h > index 9645b1194c98..0b2c54f85e6d 100644 > --- a/include/linux/pid.h > +++ b/include/linux/pid.h > @@ -86,6 +86,8 @@ static inline struct pid *get_pid(struct pid *pid) > extern void put_pid(struct pid *pid); > extern struct task_struct *pid_task(struct pid *pid, enum pid_type); > extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type); > +extern struct task_struct *get_pid_task_rcu_user(struct pid *pid, > + enum pid_type type); > > extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); > > diff --git a/kernel/pid.c b/kernel/pid.c > index 0a9f2e437217..05acbd15cfa6 100644 > --- a/kernel/pid.c > +++ b/kernel/pid.c > @@ -390,6 +390,18 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) > } > EXPORT_SYMBOL_GPL(get_pid_task); > > +struct task_struct *get_pid_task_rcu_user(struct pid *pid, enum pid_type type) > +{ > + struct task_struct *task; > + > + task = pid_task(pid, type); > + if (task && refcount_inc_not_zero(&task->rcu_users)) > + return task; > + > + return NULL; > +} > +EXPORT_SYMBOL_GPL(get_pid_task_rcu_user); > + > struct pid *find_get_pid(pid_t nr) > { > struct pid *pid;