Manfred Spraul <manfred@xxxxxxxxxxxxxxxx> writes:

> Hello everyone,
>
> On 11/5/21 22:34, Eric W. Biederman wrote:
>> +static inline void shm_clist_del(struct shmid_kernel *shp)
>> +{
>> +	struct task_struct *creator;
>> +
>> +	rcu_read_lock();
>> +	creator = rcu_dereference(shp->shm_creator);
>> +	if (creator) {
>> +		task_lock(creator);
>> +		list_del(&shp->shm_clist);
>> +		task_unlock(creator);
>> +	}
>> +	rcu_read_unlock();
>> +}
>> +
>
> shm_clist_del() only synchronizes against exit_shm() when shm_creator
> is not NULL.
>
>
>> +	list_del(&shp->shm_clist);
>> +	rcu_assign_pointer(shp->shm_creator, NULL);
>> +
>
> We set shm_creator to NULL -> no more synchronization.
>
> Now IPC_RMID can run in parallel - regardless of whether we test for
> list_empty() or shm_creator.
>
>> +
>> +	/* Guarantee shp lives after task_lock is dropped */
>> +	ipc_getref(&shp->shm_perm);
>> +
>
> task_lock() doesn't help: As soon as shm_creator is set to NULL,
> IPC_RMID won't acquire task_lock() anymore.
>
> Thus shp can disappear before we arrive at this ipc_getref().
>
> [Yes, I think I have introduced this bug.]
>
> Corrected version attached.
>
>
> I'll reboot and retest the patch, then I'll send it to akpm as a
> replacement for the current patch in mmotm.
>
> --
>
> Manfred
>
> @@ -382,48 +425,94 @@ void shm_destroy_orphaned(struct ipc_namespace *ns)
>  /* Locking assumes this will only be called with task == current */
>  void exit_shm(struct task_struct *task)
>  {
> -	struct ipc_namespace *ns = task->nsproxy->ipc_ns;
> -	struct shmid_kernel *shp, *n;
> +	for (;;) {
> +		struct shmid_kernel *shp;
> +		struct ipc_namespace *ns;
>
> -	if (list_empty(&task->sysvshm.shm_clist))
> -		return;
> +		task_lock(task);
> +
> +		if (list_empty(&task->sysvshm.shm_clist)) {
> +			task_unlock(task);
> +			break;
> +		}
> +
> +		shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel,
> +				shm_clist);
>
> -	/*
> -	 * If kernel.shm_rmid_forced is not set then only keep track of
> -	 * which shmids are orphaned, so that a later set of the sysctl
> -	 * can clean them up.
> -	 */
> -	if (!ns->shm_rmid_forced) {
> -		down_read(&shm_ids(ns).rwsem);
> -		list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
> -			shp->shm_creator = NULL;
>  		/*
> -		 * Only under read lock but we are only called on current
> -		 * so no entry on the list will be shared.
> +		 * 1) get a reference to shp.
> +		 * This must be done first: Right now, task_lock() prevents
> +		 * any concurrent IPC_RMID calls. After the list_del_init(),
> +		 * IPC_RMID will not acquire task_lock(->shm_creator)
> +		 * anymore.
>  		 */
> -		list_del(&task->sysvshm.shm_clist);
> -		up_read(&shm_ids(ns).rwsem);
> -		return;
> -	}
> +		WARN_ON(!ipc_rcu_getref(&shp->shm_perm));
>
> -	/*
> -	 * Destroy all already created segments, that were not yet mapped,
> -	 * and mark any mapped as orphan to cover the sysctl toggling.
> -	 * Destroy is skipped if shm_may_destroy() returns false.
> -	 */
> -	down_write(&shm_ids(ns).rwsem);
> -	list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
> -		shp->shm_creator = NULL;
> +		/* 2) unlink */
> +		list_del_init(&shp->shm_clist);
> +
> +		/*
> +		 * 3) Get a pointer to the ipc namespace. It is worth noting
> +		 * that this pointer is guaranteed to be valid because the
> +		 * lifetime of shp is always shorter than the lifetime of the
> +		 * namespace in which shp lives.
> +		 * We have taken task_lock(), which means that shp won't be
> +		 * freed.
> +		 */
> +		ns = shp->ns;
>
> -		if (shm_may_destroy(ns, shp)) {
> -			shm_lock_by_ptr(shp);
> -			shm_destroy(ns, shp);
> +		/*
> +		 * 4) If kernel.shm_rmid_forced is not set then only keep track of
> +		 * which shmids are orphaned, so that a later set of the sysctl
> +		 * can clean them up.
> +		 */
> +		if (!ns->shm_rmid_forced) {
> +			ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
> +			task_unlock(task);
> +			continue;
>  		}
> -	}
>
> -	/* Remove the list head from any segments still attached. */
> -	list_del(&task->sysvshm.shm_clist);
> -	up_write(&shm_ids(ns).rwsem);
> +		/*
> +		 * 5) get a reference to the namespace.
> +		 * The refcount could already be 0. If it is 0, then
> +		 * the shm objects will be freed by free_ipc_work().
> +		 */
> +		ns = get_ipc_ns_not_zero(ns);
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Isn't this increment also too late?

Doesn't this need to move up next to the ipc_rcu_getref(), while shp is
still on the list?

Assuming the code is running in parallel with shm_exit_ns(): after the
removal from shm_clist, shm_destroy() can run to completion,
shm_exit_ns() can run to completion, and the ipc namespace can be
freed.

Eric
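
For illustration only, here is a minimal sketch of the ordering Eric is
asking about: both the reference on shp and the reference on its ipc
namespace are taken before the list_del_init(), while task_lock() is
still held and shp is still on shm_clist. It reuses only helpers visible
in the quoted diff (ipc_rcu_getref(), ipc_rcu_putref()/shm_rcu_free,
get_ipc_ns_not_zero(), shm_may_destroy()); put_ipc_ns() is assumed as
the usual counterpart to get_ipc_ns_not_zero(). The destroy path is
elided, and this is not the patch under review nor necessarily the fix
that was eventually merged.

	for (;;) {
		struct shmid_kernel *shp;
		struct ipc_namespace *ns;

		task_lock(task);

		if (list_empty(&task->sysvshm.shm_clist)) {
			task_unlock(task);
			break;
		}

		shp = list_first_entry(&task->sysvshm.shm_clist,
				       struct shmid_kernel, shm_clist);

		/*
		 * Take both references while task_lock() is held and shp
		 * is still linked: until the list_del_init() below,
		 * IPC_RMID must take task_lock(), so neither shp nor
		 * shp->ns can be freed underneath us here.
		 */
		WARN_ON(!ipc_rcu_getref(&shp->shm_perm));
		ns = get_ipc_ns_not_zero(shp->ns);

		/*
		 * Only now unlink; from here on only the references taken
		 * above keep shp and the namespace alive.
		 */
		list_del_init(&shp->shm_clist);
		task_unlock(task);

		if (!ns || !ns->shm_rmid_forced) {
			/* Nothing to destroy; drop the references again. */
			ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
			if (ns)
				put_ipc_ns(ns);
			continue;
		}

		/*
		 * ... destroy the segment if shm_may_destroy() allows,
		 * then drop both references ...
		 */
	}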