On Tue, Jan 23, 2024 at 3:10 PM Suren Baghdasaryan <surenb@xxxxxxxxxx> wrote: > > With maple_tree supporting vma tree traversal under RCU and per-vma locks > making vma access RCU-safe, /proc/pid/maps can be read under RCU and > without the need to read-lock mmap_lock. However vma content can change > from under us, therefore we make a copy of the vma and we pin pointer > fields used when generating the output (currently only vm_file and > anon_name). Afterwards we check for concurrent address space > modifications, wait for them to end and retry. That last check is needed > to avoid the possibility of missing a vma during concurrent maple_tree > node replacement, which might report a NULL when a vma is replaced > with another one. While we take the mmap_lock for reading during such > contention, we do that momentarily only to record new mm_wr_seq counter. > This change is designed to reduce mmap_lock contention and prevent a > process reading /proc/pid/maps files (often a low priority task, such as > monitoring/data collection services) from blocking address space updates. > > Note that this change has a userspace visible disadvantage: it allows for > sub-page data tearing as opposed to the previous mechanism where data > tearing could happen only between pages of generated output data. > Since current userspace considers data tearing between pages to be > acceptable, we assume it will be able to handle sub-page data tearing > as well. 
> > Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx> > --- > Changes since v1 [1]: > - Fixed CONFIG_ANON_VMA_NAME=n build by introducing > anon_vma_name_{get|put}_if_valid, per SeongJae Park > - Fixed misspelling of get_vma_snapshot() > > [1] https://lore.kernel.org/all/20240122071324.2099712-3-surenb@xxxxxxxxxx/ > > fs/proc/internal.h | 2 + > fs/proc/task_mmu.c | 113 +++++++++++++++++++++++++++++++++++--- > include/linux/mm_inline.h | 18 ++++++ > 3 files changed, 126 insertions(+), 7 deletions(-) > > diff --git a/fs/proc/internal.h b/fs/proc/internal.h > index a71ac5379584..e0247225bb68 100644 > --- a/fs/proc/internal.h > +++ b/fs/proc/internal.h > @@ -290,6 +290,8 @@ struct proc_maps_private { > struct task_struct *task; > struct mm_struct *mm; > struct vma_iterator iter; > + unsigned long mm_wr_seq; > + struct vm_area_struct vma_copy; > #ifdef CONFIG_NUMA > struct mempolicy *task_mempolicy; > #endif > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c > index 3f78ebbb795f..0d5a515156ee 100644 > --- a/fs/proc/task_mmu.c > +++ b/fs/proc/task_mmu.c > @@ -126,11 +126,95 @@ static void release_task_mempolicy(struct proc_maps_private *priv) > } > #endif > > -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, > - loff_t *ppos) > +#ifdef CONFIG_PER_VMA_LOCK > + > +static const struct seq_operations proc_pid_maps_op; > + > +/* > + * Take VMA snapshot and pin vm_file and anon_name as they are used by > + * show_map_vma. > + */ > +static int get_vma_snapshot(struct proc_maps_private *priv, struct vm_area_struct *vma) > +{ > + struct vm_area_struct *copy = &priv->vma_copy; > + int ret = -EAGAIN; > + > + memcpy(copy, vma, sizeof(*vma)); > + if (copy->vm_file && !get_file_rcu(©->vm_file)) There is a problem in this patchset. I assumed that get_file_rcu() can be used against vma->vm_file but that's not true. 
vma->vm_file is freed via a call to fput() which schedules its freeing using schedule_delayed_work(..., 1) but I don't think that constitutes an RCU grace period, so it can be freed from under us. Andrew, could you please remove this patchset from your tree until I sort this out? Thanks, Suren. > + goto out; > + > + if (!anon_vma_name_get_if_valid(copy)) > + goto put_file; > + > + if (priv->mm_wr_seq == mmap_write_seq_read(priv->mm)) > + return 0; > + > + /* Address space got modified, vma might be stale. Wait and retry. */ > + rcu_read_unlock(); > + ret = mmap_read_lock_killable(priv->mm); > + mmap_write_seq_record(priv->mm, &priv->mm_wr_seq); > + mmap_read_unlock(priv->mm); > + rcu_read_lock(); > + > + if (!ret) > + ret = -EAGAIN; /* no other errors, ok to retry */ > + > + anon_vma_name_put_if_valid(copy); > +put_file: > + if (copy->vm_file) > + fput(copy->vm_file); > +out: > + return ret; > +} > + > +static void put_vma_snapshot(struct proc_maps_private *priv) > +{ > + struct vm_area_struct *vma = &priv->vma_copy; > + > + anon_vma_name_put_if_valid(vma); > + if (vma->vm_file) > + fput(vma->vm_file); > +} > + > +static inline bool needs_mmap_lock(struct seq_file *m) > +{ > + /* > + * smaps and numa_maps perform page table walk, therefore require > + * mmap_lock but maps can be read under RCU. 
> + */ > + return m->op != &proc_pid_maps_op; > +} > + > +#else /* CONFIG_PER_VMA_LOCK */ > + > +/* Without per-vma locks VMA access is not RCU-safe */ > +static inline bool needs_mmap_lock(struct seq_file *m) { return true; } > + > +#endif /* CONFIG_PER_VMA_LOCK */ > + > +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos) > { > + struct proc_maps_private *priv = m->private; > struct vm_area_struct *vma = vma_next(&priv->iter); > > +#ifdef CONFIG_PER_VMA_LOCK > + if (vma && !needs_mmap_lock(m)) { > + int ret; > + > + put_vma_snapshot(priv); > + while ((ret = get_vma_snapshot(priv, vma)) == -EAGAIN) { > + /* lookup the vma at the last position again */ > + vma_iter_init(&priv->iter, priv->mm, *ppos); > + vma = vma_next(&priv->iter); > + } > + > + if (ret) { > + put_vma_snapshot(priv); > + return NULL; > + } > + vma = &priv->vma_copy; > + } > +#endif > if (vma) { > *ppos = vma->vm_start; > } else { > @@ -169,12 +253,20 @@ static void *m_start(struct seq_file *m, loff_t *ppos) > return ERR_PTR(-EINTR); > } > > + /* Drop mmap_lock if possible */ > + if (!needs_mmap_lock(m)) { > + mmap_write_seq_record(priv->mm, &priv->mm_wr_seq); > + mmap_read_unlock(priv->mm); > + rcu_read_lock(); > + memset(&priv->vma_copy, 0, sizeof(priv->vma_copy)); > + } > + > vma_iter_init(&priv->iter, mm, last_addr); > hold_task_mempolicy(priv); > if (last_addr == -2UL) > return get_gate_vma(mm); > > - return proc_get_vma(priv, ppos); > + return proc_get_vma(m, ppos); > } > > static void *m_next(struct seq_file *m, void *v, loff_t *ppos) > @@ -183,7 +275,7 @@ static void *m_next(struct seq_file *m, void *v, loff_t *ppos) > *ppos = -1UL; > return NULL; > } > - return proc_get_vma(m->private, ppos); > + return proc_get_vma(m, ppos); > } > > static void m_stop(struct seq_file *m, void *v) > @@ -195,7 +287,10 @@ static void m_stop(struct seq_file *m, void *v) > return; > > release_task_mempolicy(priv); > - mmap_read_unlock(mm); > + if (needs_mmap_lock(m)) > + 
mmap_read_unlock(mm); > + else > + rcu_read_unlock(); > mmput(mm); > put_task_struct(priv->task); > priv->task = NULL; > @@ -283,8 +378,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) > start = vma->vm_start; > end = vma->vm_end; > show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); > - if (mm) > - anon_name = anon_vma_name(vma); > + if (mm) { > + anon_name = needs_mmap_lock(m) ? anon_vma_name(vma) : > + anon_vma_name_get_rcu(vma); > + } > > /* > * Print the dentry name for named mappings, and a > @@ -338,6 +435,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) > seq_puts(m, name); > } > seq_putc(m, '\n'); > + if (anon_name && !needs_mmap_lock(m)) > + anon_vma_name_put(anon_name); > } > > static int show_map(struct seq_file *m, void *v) > diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h > index bbdb0ca857f1..a4a644fe005e 100644 > --- a/include/linux/mm_inline.h > +++ b/include/linux/mm_inline.h > @@ -413,6 +413,21 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, > > struct anon_vma_name *anon_vma_name_get_rcu(struct vm_area_struct *vma); > > +/* > + * Takes a reference if anon_vma is valid and stable (has references). > + * Fails only if anon_vma is valid but we failed to get a reference. 
> + */ > +static inline bool anon_vma_name_get_if_valid(struct vm_area_struct *vma) > +{ > + return !vma->anon_name || anon_vma_name_get_rcu(vma); > +} > + > +static inline void anon_vma_name_put_if_valid(struct vm_area_struct *vma) > +{ > + if (vma->anon_name) > + anon_vma_name_put(vma->anon_name); > +} > + > #else /* CONFIG_ANON_VMA_NAME */ > static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} > static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} > @@ -432,6 +447,9 @@ struct anon_vma_name *anon_vma_name_get_rcu(struct vm_area_struct *vma) > return NULL; > } > > +static inline bool anon_vma_name_get_if_valid(struct vm_area_struct *vma) { return true; } > +static inline void anon_vma_name_put_if_valid(struct vm_area_struct *vma) {} > + > #endif /* CONFIG_ANON_VMA_NAME */ > > static inline void init_tlb_flush_pending(struct mm_struct *mm) > -- > 2.43.0.429.g432eaa2c6b-goog >