For those this might concern, This patch works for me and is running in a big production environment for more then 2 weeks with no problems. Performance regression (observed after commit #911af505 in http://people.canonical.com/~inaddy/lp1328088/charts/{250,500,750, 1000,1250,1500,1750}.html) does not exist anymore. Changing locking mechanism from rcu_read_lock to task_lock makes network namespaces creation less sensible to _rcu implementation_ changes and fixed observed regression. Thank you very much. -Rafael On Jul 29, 2014, at 10:21 PM, Eric W. Biederman <ebiederm@xxxxxxxxxxxx> wrote: > > The synchronous syncrhonize_rcu in switch_task_namespaces makes setns > a sufficiently expensive system call that people have complained. > > Upon inspect nsproxy no longer needs rcu protection for remote reads. > remote reads are rare. So optimize for same process reads and write > by switching using rask_lock instead. > > This yields a simpler to understand lock, and a faster setns system call. > > In particular this fixes a performance regression observed > by Rafael David Tinoco <rafael.tinoco@xxxxxxxxxxxxx>. > > This is effectively a revert of Pavel Emelyanov's commit > cf7b708c8d1d7a27736771bcf4c457b332b0f818 Make access to task's nsproxy lighter > from 2007. The race this originialy fixed no longer exists as > do_notify_parent uses task_active_pid_ns(parent) instead of > parent->nsproxy. > > Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx> > --- > fs/namespace.c | 6 +++--- > fs/proc/proc_net.c | 4 +++- > fs/proc_namespace.c | 8 +++----- > include/linux/nsproxy.h | 16 ++++++---------- > ipc/namespace.c | 6 +++--- > kernel/nsproxy.c | 15 ++++----------- > kernel/utsname.c | 6 +++--- > net/core/net_namespace.c | 10 ++++++---- > 8 files changed, 31 insertions(+), 40 deletions(-) > > diff --git a/fs/namespace.c b/fs/namespace.c > index 182bc41cd887..7187d01329c3 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -2972,13 +2972,13 @@ static void *mntns_get(struct task_struct *task) > struct mnt_namespace *ns = NULL; > struct nsproxy *nsproxy; > > - rcu_read_lock(); > - nsproxy = task_nsproxy(task); > + task_lock(task); > + nsproxy = task->nsproxy; > if (nsproxy) { > ns = nsproxy->mnt_ns; > get_mnt_ns(ns); > } > - rcu_read_unlock(); > + task_unlock(task); > > return ns; > } > diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c > index 4677bb7dc7c2..a63af3e0a612 100644 > --- a/fs/proc/proc_net.c > +++ b/fs/proc/proc_net.c > @@ -113,9 +113,11 @@ static struct net *get_proc_task_net(struct inode *dir) > rcu_read_lock(); > task = pid_task(proc_pid(dir), PIDTYPE_PID); > if (task != NULL) { > - ns = task_nsproxy(task); > + task_lock(task); > + ns = task->nsproxy; > if (ns != NULL) > net = get_net(ns->net_ns); > + task_unlock(task); > } > rcu_read_unlock(); > > diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c > index 1a81373947f3..73ca1740d839 100644 > --- a/fs/proc_namespace.c > +++ b/fs/proc_namespace.c > @@ -232,17 +232,15 @@ static int mounts_open_common(struct inode *inode, struct file *file, > if (!task) > goto err; > > - rcu_read_lock(); > - nsp = task_nsproxy(task); > + task_lock(task); > + nsp = task->nsproxy; > if (!nsp || !nsp->mnt_ns) { > - rcu_read_unlock(); > + task_unlock(task); > put_task_struct(task); > goto err; > } > ns = nsp->mnt_ns; > get_mnt_ns(ns); > - rcu_read_unlock(); > - task_lock(task); > if (!task->fs) { > task_unlock(task); > put_task_struct(task); > diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h > index b4ec59d159ac..35fa08fd7739 100644 > --- a/include/linux/nsproxy.h > +++ b/include/linux/nsproxy.h > @@ -40,32 +40,28 @@ extern struct nsproxy init_nsproxy; > * the namespaces access rules are: > * > * 1. only current task is allowed to change tsk->nsproxy pointer or > - * any pointer on the nsproxy itself > + * any pointer on the nsproxy itself. Current must hold the task_lock > + * when changing tsk->nsproxy. > * > * 2. when accessing (i.e. reading) current task's namespaces - no > * precautions should be taken - just dereference the pointers > * > * 3. the access to other task namespaces is performed like this > - * rcu_read_lock(); > - * nsproxy = task_nsproxy(tsk); > + * task_lock(task); > + * nsproxy = task->nsproxy; > * if (nsproxy != NULL) { > * / * > * * work with the namespaces here > * * e.g. get the reference on one of them > * * / > * } / * > - * * NULL task_nsproxy() means that this task is > + * * NULL task->nsproxy means that this task is > * * almost dead (zombie) > * * / > - * rcu_read_unlock(); > + * task_unlock(task); > * > */ > > -static inline struct nsproxy *task_nsproxy(struct task_struct *tsk) > -{ > - return rcu_dereference(tsk->nsproxy); > -} > - > int copy_namespaces(unsigned long flags, struct task_struct *tsk); > void exit_task_namespaces(struct task_struct *tsk); > void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); > diff --git a/ipc/namespace.c b/ipc/namespace.c > index 59451c1e214d..b54468e48e32 100644 > --- a/ipc/namespace.c > +++ b/ipc/namespace.c > @@ -154,11 +154,11 @@ static void *ipcns_get(struct task_struct *task) > struct ipc_namespace *ns = NULL; > struct nsproxy *nsproxy; > > - rcu_read_lock(); > - nsproxy = task_nsproxy(task); > + task_lock(task); > + nsproxy = task->nsproxy; > if (nsproxy) > ns = get_ipc_ns(nsproxy->ipc_ns); > - rcu_read_unlock(); > + task_unlock(task); > > return ns; > } > diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c > index 8e7811086b82..ef42d0ab3115 100644 > --- a/kernel/nsproxy.c > +++ b/kernel/nsproxy.c > @@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) > > might_sleep(); > > + task_lock(p); > ns = p->nsproxy; > + p->nsproxy = new; > + task_unlock(p); > > - rcu_assign_pointer(p->nsproxy, new); > - > - if (ns && atomic_dec_and_test(&ns->count)) { > - /* > - * wait for others to get what they want from this nsproxy. > - * > - * cannot release this nsproxy via the call_rcu() since > - * put_mnt_ns() will want to sleep > - */ > - synchronize_rcu(); > + if (ns && atomic_dec_and_test(&ns->count)) > free_nsproxy(ns); > - } > } > > void exit_task_namespaces(struct task_struct *p) > diff --git a/kernel/utsname.c b/kernel/utsname.c > index fd393124e507..883aaaa7de8a 100644 > --- a/kernel/utsname.c > +++ b/kernel/utsname.c > @@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task) > struct uts_namespace *ns = NULL; > struct nsproxy *nsproxy; > > - rcu_read_lock(); > - nsproxy = task_nsproxy(task); > + task_lock(task); > + nsproxy = task->nsproxy; > if (nsproxy) { > ns = nsproxy->uts_ns; > get_uts_ns(ns); > } > - rcu_read_unlock(); > + task_unlock(task); > > return ns; > } > diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c > index 85b62691f4f2..7c6b51a58968 100644 > --- a/net/core/net_namespace.c > +++ b/net/core/net_namespace.c > @@ -373,9 +373,11 @@ struct net *get_net_ns_by_pid(pid_t pid) > tsk = find_task_by_vpid(pid); > if (tsk) { > struct nsproxy *nsproxy; > - nsproxy = task_nsproxy(tsk); > + task_lock(tsk); > + nsproxy = tsk->nsproxy; > if (nsproxy) > net = get_net(nsproxy->net_ns); > + task_unlock(tsk); > } > rcu_read_unlock(); > return net; > @@ -632,11 +634,11 @@ static void *netns_get(struct task_struct *task) > struct net *net = NULL; > struct nsproxy *nsproxy; > > - rcu_read_lock(); > - nsproxy = task_nsproxy(task); > + task_lock(task); > + nsproxy = task->nsproxy; > if (nsproxy) > net = get_net(nsproxy->net_ns); > - rcu_read_unlock(); > + task_unlock(task); > > return net; > } > -- > 1.9.1 > _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers