[RFC] shm: extend forced shm destroy to support objects from several IPC nses (simplified)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



I have to dash so this is short.

This is what I am thinking this change should look like.

I am not certain this is truly reviewable as a single change, so I will
break it into a couple of smaller ones next time I get the chance.

Eric

 include/linux/ipc_namespace.h |  12 ++++
 include/linux/sched/task.h    |   2 +-
 ipc/shm.c                     | 135 +++++++++++++++++++++++++-----------------
 ipc/util.c                    |   5 ++
 ipc/util.h                    |   1 +
 kernel/fork.c                 |   1 -
 6 files changed, 100 insertions(+), 56 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 05e22770af51..c220767a0cc1 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -131,6 +131,13 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
 	return ns;
 }
 
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+	if (ns && refcount_inc_not_zero(&ns->ns.count))
+		return ns;
+	return NULL;
+}
+
 extern void put_ipc_ns(struct ipc_namespace *ns);
 #else
 static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
@@ -147,6 +154,11 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
 	return ns;
 }
 
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+	return ns;
+}
+
 static inline void put_ipc_ns(struct ipc_namespace *ns)
 {
 }
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ef02be869cf2..1d9533d66f7e 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -157,7 +157,7 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
  * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
  * pins the final release of task.io_context.  Also protects ->cpuset and
- * ->cgroup.subsys[]. And ->vfork_done.
+ * ->cgroup.subsys[]. And ->vfork_done. And ->shmvshm.shm_clist.
  *
  * Nests both inside and outside of read_lock(&tasklist_lock).
  * It must not be nested with write_lock_irq(&tasklist_lock),
diff --git a/ipc/shm.c b/ipc/shm.c
index ab749be6d8b7..80e3595d3a69 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -63,8 +63,9 @@ struct shmid_kernel /* private to the kernel */
 	struct ucounts		*mlock_ucounts;
 
 	/* The task created the shm object.  NULL if the task is dead. */
-	struct task_struct	*shm_creator;
+	struct task_struct __rcu *shm_creator;
 	struct list_head	shm_clist;	/* list by creator */
+	struct ipc_namespace	*shm_ns;	/* valid when shm_nattch != 0 */
 } __randomize_layout;
 
 /* shm_mode upper byte flags */
@@ -106,29 +107,17 @@ void shm_init_ns(struct ipc_namespace *ns)
 	ipc_init_ids(&shm_ids(ns));
 }
 
-/*
- * Called with shm_ids.rwsem (writer) and the shp structure locked.
- * Only shm_ids.rwsem remains locked on exit.
- */
-static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
+static void do_shm_destroy(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 {
-	struct shmid_kernel *shp;
-
-	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
-
-	if (shp->shm_nattch) {
-		shp->shm_perm.mode |= SHM_DEST;
-		/* Do not find it any more */
-		ipc_set_key_private(&shm_ids(ns), &shp->shm_perm);
-		shm_unlock(shp);
-	} else
-		shm_destroy(ns, shp);
+	struct shmid_kernel *shp =
+		container_of(ipcp, struct shmid_kernel, shm_perm);
+	shm_destroy(ns, shp);
 }
 
 #ifdef CONFIG_IPC_NS
 void shm_exit_ns(struct ipc_namespace *ns)
 {
-	free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
+	free_ipcs(ns, &shm_ids(ns), do_shm_destroy);
 	idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
 	rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht);
 }
@@ -225,9 +214,22 @@ static void shm_rcu_free(struct rcu_head *head)
 	kfree(shp);
 }
 
+static inline void shm_clist_del(struct shmid_kernel *shp)
+{
+	struct task_struct *creator;
+
+	rcu_read_lock();
+	creator = rcu_dereference(shp->shm_creator);
+	if (creator) {
+		task_lock(creator);
+		list_del(&shp->shm_clist);
+		task_unlock(creator);
+	}
+	rcu_read_unlock();
+}
+
 static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
 {
-	list_del(&s->shm_clist);
 	ipc_rmid(&shm_ids(ns), &s->shm_perm);
 }
 
@@ -283,7 +285,9 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
 	shm_file = shp->shm_file;
 	shp->shm_file = NULL;
 	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	shm_clist_del(shp);
 	shm_rmid(ns, shp);
+	shp->shm_ns = NULL;
 	shm_unlock(shp);
 	if (!is_file_hugepages(shm_file))
 		shmem_lock(shm_file, 0, shp->mlock_ucounts);
@@ -361,7 +365,7 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)
 	 *
 	 * As shp->* are changed under rwsem, it's safe to skip shp locking.
 	 */
-	if (shp->shm_creator != NULL)
+	if (rcu_access_pointer(shp->shm_creator) != NULL)
 		return 0;
 
 	if (shm_may_destroy(ns, shp)) {
@@ -382,48 +386,62 @@ void shm_destroy_orphaned(struct ipc_namespace *ns)
 /* Locking assumes this will only be called with task == current */
 void exit_shm(struct task_struct *task)
 {
-	struct ipc_namespace *ns = task->nsproxy->ipc_ns;
-	struct shmid_kernel *shp, *n;
-
-	if (list_empty(&task->sysvshm.shm_clist))
-		return;
-
-	/*
-	 * If kernel.shm_rmid_forced is not set then only keep track of
-	 * which shmids are orphaned, so that a later set of the sysctl
-	 * can clean them up.
-	 */
-	if (!ns->shm_rmid_forced) {
-		down_read(&shm_ids(ns).rwsem);
-		list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
-			shp->shm_creator = NULL;
-		/*
-		 * Only under read lock but we are only called on current
-		 * so no entry on the list will be shared.
-		 */
-		list_del(&task->sysvshm.shm_clist);
-		up_read(&shm_ids(ns).rwsem);
-		return;
-	}
+	struct list_head *head = &task->sysvshm.shm_clist;
 
 	/*
 	 * Destroy all already created segments, that were not yet mapped,
 	 * and mark any mapped as orphan to cover the sysctl toggling.
 	 * Destroy is skipped if shm_may_destroy() returns false.
 	 */
-	down_write(&shm_ids(ns).rwsem);
-	list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
-		shp->shm_creator = NULL;
+	for (;;) {
+		struct ipc_namespace *ns;
+		struct shmid_kernel *shp;
 
-		if (shm_may_destroy(ns, shp)) {
+		task_lock(task);
+		if (list_empty(head)) {
+			task_unlock(task);
+			break;
+		}
+
+		shp = list_first_entry(head, struct shmid_kernel, shm_clist);
+
+		list_del(&shp->shm_clist);
+		rcu_assign_pointer(shp->shm_creator, NULL);
+
+		/*
+		 * Guarantee that ns lives after task_list is dropped.
+		 *
+		 * This shm segment may not be attached and it's ipc
+		 * namespace may be exiting.  If so ignore the shm
+		 * segment as it will be destroyed by shm_exit_ns.
+		 */
+		ns = get_ipc_ns_not_zero(shp->shm_ns);
+		if (!ns) {
+			task_unlock(task);
+			continue;
+		}
+
+		/* Guarantee shp lives after task_lock is dropped */
+		ipc_getref(&shp->shm_perm);
+
+		/* Drop task_lock so that shm_destroy may take it */
+		task_unlock(task);
+
+		/* Can the shm segment be destroyed? */
+		down_write(&shm_ids(ns).rwsem);
+		shm_lock_by_ptr(shp);
+		if (ipc_valid_object(&shp->shm_perm) &&
+		    shm_may_destroy(ns, shp)) {
 			shm_lock_by_ptr(shp);
 			shm_destroy(ns, shp);
+		} else {
+			shm_unlock(shp);
 		}
-	}
 
-	/* Remove the list head from any segments still attached. */
-	list_del(&task->sysvshm.shm_clist);
-	up_write(&shm_ids(ns).rwsem);
+		ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
+		up_write(&shm_ids(ns).rwsem);
+		put_ipc_ns(ns);
+	}
 }
 
 static vm_fault_t shm_fault(struct vm_fault *vmf)
@@ -673,14 +691,17 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	shp->shm_segsz = size;
 	shp->shm_nattch = 0;
 	shp->shm_file = file;
-	shp->shm_creator = current;
+	RCU_INIT_POINTER(shp->shm_creator, current);
+	shp->shm_ns = ns;
 
 	/* ipc_addid() locks shp upon success. */
 	error = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
 	if (error < 0)
 		goto no_id;
 
+	task_lock(current);
 	list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
+	task_unlock(current);
 
 	/*
 	 * shmid gets reported as "inode#" in /proc/pid/maps.
@@ -913,8 +934,14 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
 	switch (cmd) {
 	case IPC_RMID:
 		ipc_lock_object(&shp->shm_perm);
-		/* do_shm_rmid unlocks the ipc object and rcu */
-		do_shm_rmid(ns, ipcp);
+		if (shp->shm_nattch) {
+			shp->shm_perm.mode |= SHM_DEST;
+			/* Do not find it any more */
+			ipc_set_key_private(&shm_ids(ns), &shp->shm_perm);
+			shm_unlock(shp);
+		} else
+			shm_destroy(ns, shp);
+		/* shm_unlock unlocked the ipc object and rcu */
 		goto out_up;
 	case IPC_SET:
 		ipc_lock_object(&shp->shm_perm);
diff --git a/ipc/util.c b/ipc/util.c
index fa2d86ef3fb8..58228f342397 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -525,6 +525,11 @@ void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
 	ipcp->key = IPC_PRIVATE;
 }
 
+void ipc_getref(struct kern_ipc_perm *ptr)
+{
+	return refcount_inc(&ptr->refcount);
+}
+
 bool ipc_rcu_getref(struct kern_ipc_perm *ptr)
 {
 	return refcount_inc_not_zero(&ptr->refcount);
diff --git a/ipc/util.h b/ipc/util.h
index 2dd7ce0416d8..e13b46ff675f 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -170,6 +170,7 @@ static inline int ipc_get_maxidx(struct ipc_ids *ids)
  * refcount is initialized by ipc_addid(), before that point call_rcu()
  * must be used.
  */
+void ipc_getref(struct kern_ipc_perm *ptr);
 bool ipc_rcu_getref(struct kern_ipc_perm *ptr);
 void ipc_rcu_putref(struct kern_ipc_perm *ptr,
 			void (*func)(struct rcu_head *head));
diff --git a/kernel/fork.c b/kernel/fork.c
index 38681ad44c76..3e881f78bcf2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3095,7 +3095,6 @@ int ksys_unshare(unsigned long unshare_flags)
 		if (unshare_flags & CLONE_NEWIPC) {
 			/* Orphan segments in old ns (see sem above). */
 			exit_shm(current);
-			shm_init_task(current);
 		}
 
 		if (new_nsproxy)




[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux