On Tue Jan 30, 2024 at 4:09 AM EET, Haitao Huang wrote:
> From: Kristen Carlson Accardi <kristen@xxxxxxxxxxxxxxx>
>
> Implement the reclamation flow for cgroups, encapsulated in the
> top-level function sgx_epc_cgroup_reclaim_pages(). It does a pre-order
> walk on the cgroup's subtree and calls sgx_reclaim_pages() at each
> node, passing in the LRU of that node. It keeps track of the total
> number of pages reclaimed and of the pages left to attempt, and stops
> the walk once the desired number of pages has been attempted.
>
> In some contexts, e.g. page fault handling, only asynchronous
> reclamation is allowed. Create a workqueue, and the corresponding work
> item and function definitions, to support asynchronous reclamation.
> Both the synchronous and asynchronous flows invoke the same top-level
> reclaim function, and will be triggered later by
> sgx_epc_cgroup_try_charge() when usage of the cgroup is at or near its
> limit.
>
> Co-developed-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> Signed-off-by: Kristen Carlson Accardi <kristen@xxxxxxxxxxxxxxx>
> Co-developed-by: Haitao Huang <haitao.huang@xxxxxxxxxxxxxxx>
> Signed-off-by: Haitao Huang <haitao.huang@xxxxxxxxxxxxxxx>
> ---
> V8:
> - Remove alignment for substructure variables. (Jarkko)
>
> V7:
> - Split this out from the big patch, #10 in V6. (Dave, Kai)
> ---
>  arch/x86/kernel/cpu/sgx/epc_cgroup.c | 174 ++++++++++++++++++++++++++-
>  arch/x86/kernel/cpu/sgx/epc_cgroup.h |   3 +
>  2 files changed, 176 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.c b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> index eac8548164de..8858a0850f8a 100644
> --- a/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> +++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> @@ -7,9 +7,173 @@
>
>  static struct sgx_epc_cgroup epc_cg_root;
>
> +static struct workqueue_struct *sgx_epc_cg_wq;

I'd document this with a comment. This was also missed for
"epc_cg_root", which should get the same treatment. Just a brief
1-2 line thing, no need for prose here; something to remind the
reader what it is later on.
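E.g. something along these lines would be enough (just a sketch, the
exact wording is up to you):

/* The EPC cgroup associated with the root misc cgroup. */
static struct sgx_epc_cgroup epc_cg_root;

/* Workqueue for asynchronous reclamation of cgroups at/near their EPC limit. */
static struct workqueue_struct *sgx_epc_cg_wq;
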
> +
> +static inline u64 sgx_epc_cgroup_page_counter_read(struct sgx_epc_cgroup *epc_cg)
> +{
> +	return atomic64_read(&epc_cg->cg->res[MISC_CG_RES_SGX_EPC].usage) / PAGE_SIZE;
> +}
> +
> +static inline u64 sgx_epc_cgroup_max_pages(struct sgx_epc_cgroup *epc_cg)
> +{
> +	return READ_ONCE(epc_cg->cg->res[MISC_CG_RES_SGX_EPC].max) / PAGE_SIZE;
> +}
> +
> +/*
> + * Get the lower bound of limits of a cgroup and its ancestors. Used in
> + * sgx_epc_cgroup_reclaim_work_func() to determine if EPC usage of a cgroup is
> + * over its limit or its ancestors' hence reclamation is needed.
> + */
> +static inline u64 sgx_epc_cgroup_max_pages_to_root(struct sgx_epc_cgroup *epc_cg)
> +{
> +	struct misc_cg *i = epc_cg->cg;
> +	u64 m = U64_MAX;
> +
> +	while (i) {
> +		m = min(m, READ_ONCE(i->res[MISC_CG_RES_SGX_EPC].max));
> +		i = misc_cg_parent(i);
> +	}
> +
> +	return m / PAGE_SIZE;
> +}
> +
>  /**
> - * sgx_epc_cgroup_try_charge() - try to charge cgroup for a single EPC page
> + * sgx_epc_cgroup_lru_empty() - check if a cgroup tree has no pages on its LRUs
> + * @root: Root of the tree to check
>   *
> + * Return: %true if all cgroups under the specified root have empty LRU lists.
> + * Used to avoid livelocks due to a cgroup having a non-zero charge count but
> + * no pages on its LRUs, e.g. due to a dead enclave waiting to be released or
> + * because all pages in the cgroup are unreclaimable.
> + */
> +bool sgx_epc_cgroup_lru_empty(struct misc_cg *root)
> +{
> +	struct cgroup_subsys_state *css_root;
> +	struct cgroup_subsys_state *pos;
> +	struct sgx_epc_cgroup *epc_cg;
> +	bool ret = true;
> +
> +	/*
> +	 * Caller ensure css_root ref acquired
> +	 */
> +	css_root = &root->css;
> +
> +	rcu_read_lock();
> +	css_for_each_descendant_pre(pos, css_root) {
> +		if (!css_tryget(pos))
> +			break;
> +
> +		rcu_read_unlock();
> +
> +		epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> +
> +		spin_lock(&epc_cg->lru.lock);
> +		ret = list_empty(&epc_cg->lru.reclaimable);
> +		spin_unlock(&epc_cg->lru.lock);
> +
> +		rcu_read_lock();
> +		css_put(pos);
> +		if (!ret)
> +			break;
> +	}
> +
> +	rcu_read_unlock();
> +
> +	return ret;
> +}
> +
> +/**
> + * sgx_epc_cgroup_reclaim_pages() - walk a cgroup tree and scan LRUs to reclaim pages
> + * @root: Root of the tree to start walking
> + * Return: Number of pages reclaimed.
> + */
> +unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
> +{
> +	/*
> +	 * Attempting to reclaim only a few pages will often fail and is
> +	 * inefficient, while reclaiming a huge number of pages can result in
> +	 * soft lockups due to holding various locks for an extended duration.
> +	 */
> +	unsigned int nr_to_scan = SGX_NR_TO_SCAN;
> +	struct cgroup_subsys_state *css_root;
> +	struct cgroup_subsys_state *pos;
> +	struct sgx_epc_cgroup *epc_cg;
> +	unsigned int cnt;
> +
> +	/* Caller ensure css_root ref acquired */
> +	css_root = &root->css;
> +
> +	cnt = 0;
> +	rcu_read_lock();
> +	css_for_each_descendant_pre(pos, css_root) {
> +		if (!css_tryget(pos))
> +			break;
> +		rcu_read_unlock();
> +
> +		epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> +		cnt += sgx_reclaim_pages(&epc_cg->lru, &nr_to_scan);
> +
> +		rcu_read_lock();
> +		css_put(pos);
> +		if (!nr_to_scan)
> +			break;
> +	}
> +
> +	rcu_read_unlock();
> +	return cnt;
> +}
> +
> +/*
> + * Scheduled by sgx_epc_cgroup_try_charge() to reclaim pages from the cgroup
> + * when the cgroup is at/near its maximum capacity
> + */
> +static void sgx_epc_cgroup_reclaim_work_func(struct work_struct *work)
> +{
> +	struct sgx_epc_cgroup *epc_cg;
> +	u64 cur, max;
> +
> +	epc_cg = container_of(work, struct sgx_epc_cgroup, reclaim_work);
> +
> +	for (;;) {
> +		max = sgx_epc_cgroup_max_pages_to_root(epc_cg);
> +
> +		/*
> +		 * Adjust the limit down by one page, the goal is to free up
> +		 * pages for fault allocations, not to simply obey the limit.
> +		 * Conditionally decrementing max also means the cur vs. max
> +		 * check will correctly handle the case where both are zero.
> +		 */
> +		if (max)
> +			max--;
> +
> +		/*
> +		 * Unless the limit is extremely low, in which case forcing
> +		 * reclaim will likely cause thrashing, force the cgroup to
> +		 * reclaim at least once if it's operating *near* its maximum
> +		 * limit by adjusting @max down by half the min reclaim size.
> +		 * This work func is scheduled by sgx_epc_cgroup_try_charge
> +		 * when it cannot directly reclaim due to being in an atomic
> +		 * context, e.g. EPC allocation in a fault handler. Waiting
> +		 * to reclaim until the cgroup is actually at its limit is less
> +		 * performant as it means the faulting task is effectively
> +		 * blocked until a worker makes its way through the global work
> +		 * queue.
> +		 */
> +		if (max > SGX_NR_TO_SCAN * 2)
> +			max -= (SGX_NR_TO_SCAN / 2);
> +
> +		cur = sgx_epc_cgroup_page_counter_read(epc_cg);
> +
> +		if (cur <= max || sgx_epc_cgroup_lru_empty(epc_cg->cg))
> +			break;
> +
> +		/* Keep reclaiming until above condition is met. */
> +		sgx_epc_cgroup_reclaim_pages(epc_cg->cg);
> +	}
> +}
> +
> +/**
> + * sgx_epc_cgroup_try_charge() - try to charge cgroup for a single EPC page
>   * @epc_cg: The EPC cgroup to be charged for the page.
>   * Return:
>   * * %0 - If successfully charged.
> @@ -37,6 +201,7 @@ static void sgx_epc_cgroup_free(struct misc_cg *cg)
>  	if (!epc_cg)
>  		return;
>
> +	cancel_work_sync(&epc_cg->reclaim_work);
>  	kfree(epc_cg);
>  }
>
> @@ -49,6 +214,8 @@ const struct misc_res_ops sgx_epc_cgroup_ops = {
>
>  static void sgx_epc_misc_init(struct misc_cg *cg, struct sgx_epc_cgroup *epc_cg)
>  {
> +	sgx_lru_init(&epc_cg->lru);
> +	INIT_WORK(&epc_cg->reclaim_work, sgx_epc_cgroup_reclaim_work_func);
>  	cg->res[MISC_CG_RES_SGX_EPC].priv = epc_cg;
>  	epc_cg->cg = cg;
>  }
> @@ -68,6 +235,11 @@ static int sgx_epc_cgroup_alloc(struct misc_cg *cg)
>
>  void sgx_epc_cgroup_init(void)
>  {
> +	sgx_epc_cg_wq = alloc_workqueue("sgx_epc_cg_wq",
> +					WQ_UNBOUND | WQ_FREEZABLE,
> +					WQ_UNBOUND_MAX_ACTIVE);
> +	BUG_ON(!sgx_epc_cg_wq);
> +
>  	misc_cg_set_ops(MISC_CG_RES_SGX_EPC, &sgx_epc_cgroup_ops);
>  	sgx_epc_misc_init(misc_cg_root(), &epc_cg_root);
>  }
> diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.h b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> index 6b664b4c321f..e3c6a08f0ee8 100644
> --- a/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> +++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> @@ -34,6 +34,8 @@ static inline void sgx_epc_cgroup_init(void) { }
>  #else
>  struct sgx_epc_cgroup {
>  	struct misc_cg *cg;
> +	struct sgx_epc_lru_list lru;
> +	struct work_struct reclaim_work;
>  };
>
>  static inline struct sgx_epc_cgroup *sgx_epc_cgroup_from_misc_cg(struct misc_cg *cg)
> @@ -66,6 +68,7 @@ static inline void sgx_put_epc_cg(struct sgx_epc_cgroup *epc_cg)
>
>  int sgx_epc_cgroup_try_charge(struct sgx_epc_cgroup *epc_cg);
>  void sgx_epc_cgroup_uncharge(struct sgx_epc_cgroup *epc_cg);
> +bool sgx_epc_cgroup_lru_empty(struct misc_cg *root);
>  void sgx_epc_cgroup_init(void);
>
>  #endif

BR, Jarkko