Re: [PATCH v5 03/11] tracing/user_events: Use remote writes for event enablement

Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx> · Mon, 5 Dec 2022 16:28:03 -0500

On 2022-12-05 16:00, Beau Belgrave wrote:
[...]

  #ifdef CONFIG_USER_EVENTS
  struct user_event_mm {
+	struct list_head link;
+	struct list_head enablers;
+	struct mm_struct *mm;
+	struct user_event_mm *next;
+	refcount_t refcnt;
+	refcount_t tasks;
  };
-#endif

+extern void user_event_mm_dup(struct task_struct *t,

+			      struct user_event_mm *old_mm);
+
+extern void user_event_mm_remove(struct task_struct *t);
+
+static inline void user_events_fork(struct task_struct *t,
+				    unsigned long clone_flags)
+{
+	struct user_event_mm *old_mm;
+
+	if (!t || !current->user_event_mm)
+		return;
+
+	old_mm = current->user_event_mm;
+
+	if (clone_flags & CLONE_VM) {
+		t->user_event_mm = old_mm;
+		refcount_inc(&old_mm->tasks);
+		return;
+	}
+
+	user_event_mm_dup(t, old_mm);
+}
+
+static inline void user_events_execve(struct task_struct *t)
+{
+	if (!t || !t->user_event_mm)
+		return;
+
+	user_event_mm_remove(t);
+}
+
+static inline void user_events_exit(struct task_struct *t)
+{
+	if (!t || !t->user_event_mm)
+		return;
+
+	user_event_mm_remove(t);
+}

So this is adding user_event_mm_remove() calls on each execve and each 

process exit, correct ?

[...]

+
+void user_event_mm_remove(struct task_struct *t)
+{
+	struct user_event_mm *mm;
+	unsigned long flags;
+
+	might_sleep();
+
+	mm = t->user_event_mm;
+	t->user_event_mm = NULL;
+
+	/* Clone will increment the tasks, only remove if last clone */
+	if (!refcount_dec_and_test(&mm->tasks))
+		return;
+
+	/* Remove the mm from the list, so it can no longer be enabled */
+	spin_lock_irqsave(&user_event_mms_lock, flags);
+	list_del_rcu(&mm->link);
+	spin_unlock_irqrestore(&user_event_mms_lock, flags);
+
+	/*
+	 * Put for mm must be done after RCU sync to handle new refs in
+	 * between the list_del_rcu() and now. This ensures any get refs
+	 * during rcu_read_lock() are accounted for during list removal.
+	 *
+	 * CPU A			|	CPU B
+	 * ---------------------------------------------------------------
+	 * user_event_mm_remove()	|	rcu_read_lock();
+	 * list_del_rcu()		|	list_for_each_entry_rcu();
+	 * synchronize_rcu()		|	refcount_inc();
+	 * .				|	rcu_read_unlock();
+	 * user_event_mm_put()		|	.
+	 */
+	synchronize_rcu();

This means a synchronize_rcu() is added on each execve and each process 

exit ? I am really worried about the performance impact of this big 

hammer synchronization in those key points of process lifetime.

Thanks,

Mathieu

+
+	/*
+	 * We need to wait for currently occurring writes to stop within
+	 * the mm. This is required since exit_mm() snaps the current rss
+	 * stats and clears them. On the final mmdrop(), check_mm() will
+	 * report a bug if these increment.
+	 *
+	 * All writes/pins are done under mmap_read lock, take the write
+	 * lock to ensure in-progress faults have completed. Faults that
+	 * are pending but yet to run will check the task count and skip
+	 * the fault since the mm is going away.
+	 */
+	mmap_write_lock(mm->mm);
+	mmap_write_unlock(mm->mm);
+
+	/* MM is still alive, but won't be updated anymore */
+	user_event_mm_put(mm);
+}
+
+void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)
  {
-	int i = user->index;
+	struct user_event_mm *mm = user_event_mm_create(t);
+	struct user_event_enabler *enabler;
+
+	if (!mm)
+		return;
+
+	rcu_read_lock();

-	user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);

+	list_for_each_entry_rcu(enabler, &old_mm->enablers, link)
+		if (!user_event_enabler_dup(enabler, mm))
+			goto error;
+
+	rcu_read_unlock();
+
+	return;
+error:
+	rcu_read_unlock();
+	user_event_mm_remove(t);
  }

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com