Re: [PATCH 1/3] rcu: Use static initializer for krc.lock

On Thu, Apr 16, 2020 at 11:05:15PM -0400, Joel Fernandes wrote:
> On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney wrote:
> > > 
> > > We might need different calling-context restrictions for the two variants
> > > of kfree_rcu().  And we might need to come up with some sort of lockdep
> > > check for "safe to use normal spinlock in -rt".
> > 
> > Oh. We do have this already, it is called CONFIG_PROVE_RAW_LOCK_NESTING.
> > This one will scream if you do
> > 	raw_spin_lock();
> > 	spin_lock();
> > 
> > Sadly, as of today, there is code triggering this which needs to be
> > addressed first (but it is on the list of things to do).
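
A minimal sketch of the nesting that CONFIG_PROVE_RAW_LOCK_NESTING is meant
to catch, using hypothetical locks purely for illustration (on PREEMPT_RT a
spinlock_t becomes a sleeping lock, so taking one inside a raw_spinlock_t
critical section is invalid):

<snip>
#include <linux/spinlock.h>

/* Hypothetical locks, for illustration only. */
static DEFINE_RAW_SPINLOCK(outer_raw_lock);
static DEFINE_SPINLOCK(inner_lock);

static void bad_nesting_example(void)
{
	raw_spin_lock(&outer_raw_lock);
	/*
	 * Invalid nesting: on PREEMPT_RT, spin_lock() can sleep here, and
	 * CONFIG_PROVE_RAW_LOCK_NESTING makes lockdep report this even on
	 * non-RT builds.
	 */
	spin_lock(&inner_lock);
	spin_unlock(&inner_lock);
	raw_spin_unlock(&outer_raw_lock);
}
<snip>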
> > 
> > Given the thread so far, is it okay if I repost the series with
> > migrate_disable() instead of accepting a possible migration before
> > grabbing the lock? I would prefer to avoid the extra RT case (avoiding
> > memory allocations in a possible atomic context) until we get there.
> 
> I prefer something like the following to make it possible to invoke
> kfree_rcu() from atomic context considering call_rcu() is already callable
> from such contexts. Thoughts?
> 
> (Only build tested)
> ---8<-----------------------
> 
> From: "Joel Fernandes (Google)" <joel@xxxxxxxxxxxxxxxxx>
> Subject: [PATCH] rcu/tree: Avoid allocating in non-preemptible context for
>  PREEMPT_RT kernels
> 
> Per recent discussions, kfree_rcu() is a low-level facility that should be
> callable from atomic context (raw spinlock sections, IRQ-disabled sections,
> etc.).
> 
> However, it depends on page allocation, which acquires sleeping locks on
> PREEMPT_RT.
> 
> In order to support all use cases, avoid the page allocation on
> PREEMPT_RT. The page allocation is only an optimization, so skipping it
> does not break functionality. Further, future patches will pre-allocate
> the pages, reducing the likelihood that a page allocation is needed at
> all.
> 
> We also convert the spinlock_t to a raw_spinlock_t so that it does not
> sleep in PREEMPT_RT's raw atomic critical sections.
> 
> Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
> ---
>  kernel/rcu/tree.c | 42 +++++++++++++++++++++++++-----------------
>  1 file changed, 25 insertions(+), 17 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index f288477ee1c26..ba831712fb307 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2905,7 +2905,7 @@ struct kfree_rcu_cpu {
>  	struct kfree_rcu_bulk_data *bhead;
>  	struct kfree_rcu_bulk_data *bcached;
>  	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> -	spinlock_t lock;
> +	raw_spinlock_t lock;
>  	struct delayed_work monitor_work;
>  	bool monitor_todo;
>  	bool initialized;
> @@ -2939,12 +2939,12 @@ static void kfree_rcu_work(struct work_struct *work)
>  	krwp = container_of(to_rcu_work(work),
>  			    struct kfree_rcu_cpu_work, rcu_work);
>  	krcp = krwp->krcp;
> -	spin_lock_irqsave(&krcp->lock, flags);
> +	raw_spin_lock_irqsave(&krcp->lock, flags);
>  	head = krwp->head_free;
>  	krwp->head_free = NULL;
>  	bhead = krwp->bhead_free;
>  	krwp->bhead_free = NULL;
> -	spin_unlock_irqrestore(&krcp->lock, flags);
> +	raw_spin_unlock_irqrestore(&krcp->lock, flags);
>  
>  	/* "bhead" is now private, so traverse locklessly. */
>  	for (; bhead; bhead = bnext) {
> @@ -3047,14 +3047,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
>  	krcp->monitor_todo = false;
>  	if (queue_kfree_rcu_work(krcp)) {
>  		// Success! Our job is done here.
> -		spin_unlock_irqrestore(&krcp->lock, flags);
> +		raw_spin_unlock_irqrestore(&krcp->lock, flags);
>  		return;
>  	}
>  
>  	// Previous RCU batch still in progress, try again later.
>  	krcp->monitor_todo = true;
>  	schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> -	spin_unlock_irqrestore(&krcp->lock, flags);
> +	raw_spin_unlock_irqrestore(&krcp->lock, flags);
>  }
>  
>  /*
> @@ -3067,16 +3067,16 @@ static void kfree_rcu_monitor(struct work_struct *work)
>  	struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
>  						 monitor_work.work);
>  
> -	spin_lock_irqsave(&krcp->lock, flags);
> +	raw_spin_lock_irqsave(&krcp->lock, flags);
>  	if (krcp->monitor_todo)
>  		kfree_rcu_drain_unlock(krcp, flags);
>  	else
> -		spin_unlock_irqrestore(&krcp->lock, flags);
> +		raw_spin_unlock_irqrestore(&krcp->lock, flags);
>  }
>  
>  static inline bool
>  kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
> -	struct rcu_head *head, rcu_callback_t func)
> +	struct rcu_head *head, rcu_callback_t func, bool alloc)
>  {
>  	struct kfree_rcu_bulk_data *bnode;
>  
> @@ -3092,6 +3092,10 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
>  		if (!bnode) {
>  			WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
>  
> +			/* If allocation is not allowed, don't do it. */
> +			if (!alloc)
> +				return false;
> +
>  			bnode = (struct kfree_rcu_bulk_data *)
>  				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
>  		}
Joel, we also need to drop the lock before alloc_pages(). We are not
allowed to allocate while holding a raw_spin_lock, because on PREEMPT_RT
the page allocator uses sleepable spinlocks. If I am missing something,
please correct me :)

<snip>
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ba831712fb30..9a334e3c7f96 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3076,7 +3076,8 @@ static void kfree_rcu_monitor(struct work_struct *work)

 static inline bool
 kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
-       struct rcu_head *head, rcu_callback_t func, bool alloc)
+       struct rcu_head *head, rcu_callback_t func, bool alloc,
+       unsigned long *flags)
 {
        struct kfree_rcu_bulk_data *bnode;

@@ -3096,8 +3097,22 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
                        if (!alloc)
                                return false;

+                       /* Drop the lock. */
+                       if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+                               migrate_disable();
+                               raw_spin_unlock(&krcp->lock);
+                               local_irq_restore(*flags);
+                       }
+
                        bnode = (struct kfree_rcu_bulk_data *)
                                __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+
+                       /* Grab the lock back. */
+                       if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+                               local_irq_save(*flags);
+                               raw_spin_lock(&krcp->lock);
+                               migrate_enable();
+                       }
                }

                /* Switch to emergency path. */
@@ -3164,7 +3179,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
         * Under high memory pressure GFP_NOWAIT can fail,
         * in that case the emergency path is maintained.
         */
-       if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func, alloc))) {
+       if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func, alloc, &flags))) {
                head->func = func;
                head->next = krcp->head;
                krcp->head = head;
<snip>
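
As a side note, the quoted diffs do not show how kfree_call_rcu() computes
the "alloc" flag it passes down, so here is only a hedged sketch of one way
the caller could decide it; the helper name and the exact condition are
assumptions, not a quote from Joel's patch:

<snip>
/*
 * Sketch only -- a hypothetical helper, not from the posted patch:
 * decide whether the kfree_rcu() bulk path may call the page allocator.
 * On PREEMPT_RT the allocator can take sleeping locks, so only allow
 * the allocation from preemptible context there; on !PREEMPT_RT,
 * GFP_NOWAIT is usable even from atomic context.
 */
static bool kfree_rcu_page_alloc_allowed(void)
{
	return !IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible();
}
<snip>

kfree_call_rcu() would then have to evaluate this before disabling
interrupts and taking krcp->lock (preemptible() is always false once IRQs
are off) and pass the result as "alloc" to
kfree_call_rcu_add_ptr_to_bulk(), as in the hunk above.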

If everyone agrees to go with such a solution, shall we split this work
into at least two patches: one converting to raw spinlocks, and a second
one fixing the allocation for the RT case?

--
Vlad Rezki


