Re: [PATCH v4 07/11] mm: memcontrol: make all the callers of {folio,page}_memcg() safe

Roman Gushchin <roman.gushchin@xxxxxxxxx> · Tue, 24 May 2022 20:03:41 -0700

On Tue, May 24, 2022 at 02:05:47PM +0800, Muchun Song wrote:
> When we use objcg APIs to charge the LRU pages, the page will not hold
> a reference to the memcg associated with the page. So the caller of the
> {folio,page}_memcg() should hold an rcu read lock or obtain a reference
> to the memcg associated with the page to protect memcg from being
> released. So introduce get_mem_cgroup_from_{page,folio}() to obtain a
> reference to the memory cgroup associated with the page.
> 
> In this patch, make all the callers hold an rcu read lock or obtain a
> reference to the memcg to protect memcg from being released when the LRU
> pages reparented.
> 
> We do not need to adjust the callers of {folio,page}_memcg() during
> the whole process of mem_cgroup_move_task(). Because the cgroup migration
> and memory cgroup offlining are serialized by @cgroup_mutex. In this
> routine, the LRU pages cannot be reparented to its parent memory cgroup.
> So {folio,page}_memcg() is stable and cannot be released.
> 
> This is a preparation for reparenting the LRU pages.
> 
> Signed-off-by: Muchun Song <songmuchun@xxxxxxxxxxxxx>

Overall the patch looks good to me (some nits below).

> ---
>  fs/buffer.c                      |  4 +--
>  fs/fs-writeback.c                | 23 ++++++++--------
>  include/linux/memcontrol.h       | 51 ++++++++++++++++++++++++++++++++---
>  include/trace/events/writeback.h |  5 ++++
>  mm/memcontrol.c                  | 58 ++++++++++++++++++++++++++++++----------
>  mm/migrate.c                     |  4 +++
>  mm/page_io.c                     |  5 ++--
>  7 files changed, 117 insertions(+), 33 deletions(-)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 2b5561ae5d0b..80975a457670 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -819,8 +819,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
>  	if (retry)
>  		gfp |= __GFP_NOFAIL;
>  
> -	/* The page lock pins the memcg */
> -	memcg = page_memcg(page);
> +	memcg = get_mem_cgroup_from_page(page);

Looking at these changes I wonder if we need to remove unsafe getters or
at least add a BOLD comment on how/when it's safe to use them.

>  	old_memcg = set_active_memcg(memcg);
>  
>  	head = NULL;
> @@ -840,6 +839,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
>  		set_bh_page(bh, page, offset);
>  	}
>  out:
> +	mem_cgroup_put(memcg);
>  	set_active_memcg(old_memcg);
>  	return head;
>  /*
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 1fae0196292a..56612ace8778 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -243,15 +243,13 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
>  	if (inode_cgwb_enabled(inode)) {
>  		struct cgroup_subsys_state *memcg_css;
>  
> -		if (page) {
> -			memcg_css = mem_cgroup_css_from_page(page);
> -			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
> -		} else {
> -			/* must pin memcg_css, see wb_get_create() */
> +		/* must pin memcg_css, see wb_get_create() */
> +		if (page)
> +			memcg_css = get_mem_cgroup_css_from_page(page);
> +		else
>  			memcg_css = task_get_css(current, memory_cgrp_id);
> -			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
> -			css_put(memcg_css);
> -		}
> +		wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
> +		css_put(memcg_css);
>  	}
>  
>  	if (!wb)
> @@ -868,16 +866,16 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
>  	if (!wbc->wb || wbc->no_cgroup_owner)
>  		return;
>  
> -	css = mem_cgroup_css_from_page(page);
> +	css = get_mem_cgroup_css_from_page(page);
>  	/* dead cgroups shouldn't contribute to inode ownership arbitration */
>  	if (!(css->flags & CSS_ONLINE))
> -		return;
> +		goto out;
>  
>  	id = css->id;
>  
>  	if (id == wbc->wb_id) {
>  		wbc->wb_bytes += bytes;
> -		return;
> +		goto out;
>  	}
>  
>  	if (id == wbc->wb_lcand_id)
> @@ -890,6 +888,9 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
>  		wbc->wb_tcand_bytes += bytes;
>  	else
>  		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
> +
> +out:
> +	css_put(css);
>  }
>  EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
>  
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 8c2f1ba2f471..3a0e2592434e 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -373,7 +373,7 @@ static inline bool folio_memcg_kmem(struct folio *folio);
>   * a valid memcg, but can be atomically swapped to the parent memcg.
>   *
>   * The caller must ensure that the returned memcg won't be released:
> - * e.g. acquire the rcu_read_lock or css_set_lock.
> + * e.g. acquire the rcu_read_lock or objcg_lock or cgroup_mutex.
>   */
>  static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
>  {
> @@ -454,7 +454,37 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
>  	return folio_memcg(page_folio(page));
>  }
>  
> -/**
> +/*
> + * get_mem_cgroup_from_folio - Obtain a reference on the memory cgroup
> + *			       associated with a folio.
> + * @folio: Pointer to the folio.
> + *
> + * Returns a pointer to the memory cgroup (and obtain a reference on it)
> + * associated with the folio, or NULL. This function assumes that the
> + * folio is known to have a proper memory cgroup pointer. It's not safe
> + * to call this function against some type of pages, e.g. slab pages or
> + * ex-slab pages.
> + */
> +static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
> +{
> +	struct mem_cgroup *memcg;
> +
> +	rcu_read_lock();
> +retry:
> +	memcg = folio_memcg(folio);
> +	if (unlikely(memcg && !css_tryget(&memcg->css)))
> +		goto retry;
> +	rcu_read_unlock();
> +
> +	return memcg;
> +}
> +
> +static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
> +{
> +	return get_mem_cgroup_from_folio(page_folio(page));
> +}
> +
> +/*
>   * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
>   * @folio: Pointer to the folio.
>   *
> @@ -873,7 +903,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
>  	return match;
>  }
>  
> -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
> +struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page);
>  ino_t page_cgroup_ino(struct page *page);
>  
>  static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
> @@ -1047,10 +1077,13 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
>  static inline void count_memcg_page_event(struct page *page,
>  					  enum vm_event_item idx)
>  {
> -	struct mem_cgroup *memcg = page_memcg(page);
> +	struct mem_cgroup *memcg;
>  
> +	rcu_read_lock();
> +	memcg = page_memcg(page);
>  	if (memcg)
>  		count_memcg_events(memcg, idx, 1);
> +	rcu_read_unlock();
>  }
>  
>  static inline void count_memcg_event_mm(struct mm_struct *mm,
> @@ -1129,6 +1162,16 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
>  	return NULL;
>  }
>  
> +static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
> +{
> +	return NULL;
> +}
> +
> +static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
> +{
> +	return NULL;
> +}
> +
>  static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
>  {
>  	WARN_ON_ONCE(!rcu_read_lock_held());
> diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
> index 86b2a82da546..cdb822339f13 100644
> --- a/include/trace/events/writeback.h
> +++ b/include/trace/events/writeback.h
> @@ -258,6 +258,11 @@ TRACE_EVENT(track_foreign_dirty,
>  		__entry->ino		= inode ? inode->i_ino : 0;
>  		__entry->memcg_id	= wb->memcg_css->id;
>  		__entry->cgroup_ino	= __trace_wb_assign_cgroup(wb);
> +		/*
> +		 * TP_fast_assign() is under preemption disabled which can
> +		 * serve as an RCU read-side critical section so that the
> +		 * memcg returned by folio_memcg() cannot be freed.
> +		 */
>  		__entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup);
>  	),
>  
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index b38a77f6696f..dcaf6cf5dc74 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -371,7 +371,7 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
>  #endif
>  
>  /**
> - * mem_cgroup_css_from_page - css of the memcg associated with a page
> + * get_mem_cgroup_css_from_page - get css of the memcg associated with a page
>   * @page: page of interest
>   *
>   * If memcg is bound to the default hierarchy, css of the memcg associated
> @@ -381,13 +381,15 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
>   * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
>   * is returned.
>   */
> -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
> +struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page)
>  {
>  	struct mem_cgroup *memcg;
>  
> -	memcg = page_memcg(page);
> +	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> +		return &root_mem_cgroup->css;
>  
> -	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
> +	memcg = get_mem_cgroup_from_page(page);
> +	if (!memcg)
>  		memcg = root_mem_cgroup;
>  
>  	return &memcg->css;
> @@ -770,13 +772,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
>  void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
>  			     int val)
>  {
> -	struct page *head = compound_head(page); /* rmap on tail pages */
> +	struct folio *folio = page_folio(page); /* rmap on tail pages */
>  	struct mem_cgroup *memcg;
>  	pg_data_t *pgdat = page_pgdat(page);
>  	struct lruvec *lruvec;
>  
>  	rcu_read_lock();
> -	memcg = page_memcg(head);
> +	memcg = folio_memcg(folio);
>  	/* Untracked pages have no memcg, no lruvec. Update only the node */
>  	if (!memcg) {
>  		rcu_read_unlock();
> @@ -2058,7 +2060,9 @@ void folio_memcg_lock(struct folio *folio)
>  	 * The RCU lock is held throughout the transaction.  The fast
>  	 * path can get away without acquiring the memcg->move_lock
>  	 * because page moving starts with an RCU grace period.
> -         */
> +	 *
> +	 * The RCU lock also protects the memcg from being freed.
> +	 */
>  	rcu_read_lock();
>  
>  	if (mem_cgroup_disabled())
> @@ -3296,7 +3300,7 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
>  void split_page_memcg(struct page *head, unsigned int nr)
>  {
>  	struct folio *folio = page_folio(head);
> -	struct mem_cgroup *memcg = folio_memcg(folio);
> +	struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
>  	int i;
>  
>  	if (mem_cgroup_disabled() || !memcg)
> @@ -3309,6 +3313,8 @@ void split_page_memcg(struct page *head, unsigned int nr)
>  		obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
>  	else
>  		css_get_many(&memcg->css, nr - 1);
> +
> +	css_put(&memcg->css);
>  }
>  
>  #ifdef CONFIG_MEMCG_SWAP
> @@ -4511,7 +4517,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
>  void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
>  					     struct bdi_writeback *wb)
>  {
> -	struct mem_cgroup *memcg = folio_memcg(folio);
> +	struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
>  	struct memcg_cgwb_frn *frn;
>  	u64 now = get_jiffies_64();
>  	u64 oldest_at = now;
> @@ -4558,6 +4564,7 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
>  		frn->memcg_id = wb->memcg_css->id;
>  		frn->at = now;
>  	}
> +	css_put(&memcg->css);
>  }
>  
>  /* issue foreign writeback flushes for recorded foreign dirtying events */
> @@ -6092,6 +6099,14 @@ static void mem_cgroup_move_charge(void)
>  	atomic_dec(&mc.from->moving_account);
>  }
>  
> +/*
> + * The cgroup migration and memory cgroup offlining are serialized by
> + * @cgroup_mutex. If we reach here, it means that the LRU pages cannot
> + * be reparented to its parent memory cgroup. So during the whole process
> + * of mem_cgroup_move_task(), page_memcg(page) is stable. So we do not
> + * need to worry about the memcg (returned from page_memcg()) being
> + * released even if we do not hold an rcu read lock.
> + */
>  static void mem_cgroup_move_task(void)
>  {
>  	if (mc.to) {
> @@ -6895,7 +6910,7 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
>  	if (folio_memcg(new))
>  		return;
>  
> -	memcg = folio_memcg(old);
> +	memcg = get_mem_cgroup_from_folio(old);
>  	VM_WARN_ON_ONCE_FOLIO(!memcg, old);
>  	if (!memcg)
>  		return;
> @@ -6914,6 +6929,8 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
>  	mem_cgroup_charge_statistics(memcg, nr_pages);
>  	memcg_check_events(memcg, folio_nid(new));
>  	local_irq_restore(flags);
> +
> +	css_put(&memcg->css);
>  }
>  
>  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
> @@ -7100,6 +7117,10 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
>  	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
>  		return;
>  
> +	/*
> +	 * Interrupts should be disabled by the caller (see the comments below),
> +	 * which can serve as RCU read-side critical sections.
> +	 */
>  	memcg = folio_memcg(folio);
>  
>  	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
> @@ -7165,15 +7186,16 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
>  	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
>  		return 0;
>  
> +	rcu_read_lock();
>  	memcg = page_memcg(page);
>  
>  	VM_WARN_ON_ONCE_PAGE(!memcg, page);
>  	if (!memcg)
> -		return 0;
> +		goto out;
>  
>  	if (!entry.val) {
>  		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
> -		return 0;
> +		goto out;
>  	}
>  
>  	memcg = mem_cgroup_id_get_online(memcg);
> @@ -7183,6 +7205,7 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
>  		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
>  		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
>  		mem_cgroup_id_put(memcg);
> +		rcu_read_unlock();
>  		return -ENOMEM;

If you add the "out" label, please use it here too.

>  	}
>  
> @@ -7192,6 +7215,8 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
>  	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
>  	VM_BUG_ON_PAGE(oldid, page);
>  	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
> +out:
> +	rcu_read_unlock();
>  
>  	return 0;
>  }
> @@ -7246,17 +7271,22 @@ bool mem_cgroup_swap_full(struct page *page)
>  	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
>  		return false;
>  
> +	rcu_read_lock();
>  	memcg = page_memcg(page);
>  	if (!memcg)
> -		return false;
> +		goto out;
>  
>  	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
>  		unsigned long usage = page_counter_read(&memcg->swap);
>  
>  		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
> -		    usage * 2 >= READ_ONCE(memcg->swap.max))
> +		    usage * 2 >= READ_ONCE(memcg->swap.max)) {
> +			rcu_read_unlock();
>  			return true;

Please, make something like
	ret = true;
	goto out;
here. It will be more consistent.

> +		}
>  	}
> +out:
> +	rcu_read_unlock();
>  
>  	return false;
>  }
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 6c31ee1e1c9b..59e97a8a64a0 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -430,6 +430,10 @@ int folio_migrate_mapping(struct address_space *mapping,
>  		struct lruvec *old_lruvec, *new_lruvec;
>  		struct mem_cgroup *memcg;
>  
> +		/*
> +		 * Irq is disabled, which can serve as RCU read-side critical
> +		 * sections.
> +		 */
>  		memcg = folio_memcg(folio);
>  		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
>  		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 89fbf3cae30f..a0d9cd68e87a 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -221,13 +221,14 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
>  	struct cgroup_subsys_state *css;
>  	struct mem_cgroup *memcg;
>  
> +	rcu_read_lock();
>  	memcg = page_memcg(page);
>  	if (!memcg)
> -		return;
> +		goto out;
>  
> -	rcu_read_lock();
>  	css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
>  	bio_associate_blkg_from_css(bio, css);
> +out:
>  	rcu_read_unlock();
>  }
>  #else
> -- 
> 2.11.0
>