Re: [PATCH] bpf: don't call mmap_read_trylock() from IRQ context

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello.

On 2024/06/15 1:40, Alexei Starovoitov wrote:
> On Fri, Jun 14, 2024 at 8:15 AM Tetsuo Handa
> <penguin-kernel@xxxxxxxxxxxxxxxxxxx> wrote:
>> "inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage." upon unlock from IRQ work
>> was reported at https://syzkaller.appspot.com/bug?extid=40905bca570ae6784745 .
> 
> imo the issue is elsewhere. syzbot reports:
> local_lock_acquire include/linux/local_lock_internal.h:29 [inline]
>  __mmap_lock_do_trace_released+0x9c/0x620 mm/mmap_lock.c:243
>  __mmap_lock_trace_released include/linux/mmap_lock.h:42 [inline]
> 
> it complains about:
> local_lock(&memcg_paths.lock);
> in TRACE_MMAP_LOCK_EVENT.
> which looks like a false positive.

Commit 2b5067a8143e ("mm: mmap_lock: add tracepoints around lock
acquisition") introduced TRACE_MMAP_LOCK_EVENT() macro as below.

#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
       do {                                                                   \
               const char *memcg_path;                                        \
               preempt_disable();                                             \
               memcg_path = get_mm_memcg_path(mm);                            \
               trace_mmap_lock_##type(mm,                                     \
                                      memcg_path != NULL ? memcg_path : "",   \
                                      ##__VA_ARGS__);                         \
               if (likely(memcg_path != NULL))                                \
                       put_memcg_path_buf();                                  \
               preempt_enable();                                              \
       } while (0)

Commit d01079f3d0c0 ("mm/mmap_lock: remove dead code for !CONFIG_TRACING
configurations") moved the location of this macro.

Commit 832b50725373 ("mm: mmap_lock: use local locks instead of disabling
preemption") replaced preempt_disable() with local_lock(&memcg_paths.lock)
based on an argument that preempt_disable() has to be avoided because
get_mm_memcg_path() might sleep if PREEMPT_RT=y.

The local_lock() macro is defined as

  #define local_lock(lock)		__local_lock(lock)

in include/linux/local_lock.h and __local_lock() macro is defined as

  #define __local_lock(lock)					\
  	do {							\
  		preempt_disable();				\
  		local_lock_acquire(this_cpu_ptr(lock));		\
  	} while (0)

if PREEMPT_RT=n or

  #define __local_lock(__lock)					\
  	do {							\
  		migrate_disable();				\
  		spin_lock(this_cpu_ptr((__lock)));		\
  	} while (0)

if PREEMPT_RT=y in include/linux/local_lock_internal.h .
Note the difference between preempt_disable() and migrate_disable().

Commit d01079f3d0c0 ("mm/mmap_lock: remove dead code for !CONFIG_TRACING
configurations") mistakenly replaced local_lock() with preempt_disable(),
and commit e904c2ccf9b5 ("mm: mmap_lock: fix disabling preemption
directly") changed preempt_disable() back to local_lock().



Now, syzbot started reporting

  inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.

and

  inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.

messages, because local_lock() does not disable IRQs.

I think we can replace local_lock() with local_lock_irqsave() as follows:

diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 1854850b4b89..1901ad22ccbd 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -156,14 +156,15 @@ static inline void put_memcg_path_buf(void)
 #define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
 	do {                                                                   \
 		const char *memcg_path;                                        \
-		local_lock(&memcg_paths.lock);                                 \
+		unsigned long flags;                                           \
+		local_lock_irqsave(&memcg_paths.lock, flags);                  \
 		memcg_path = get_mm_memcg_path(mm);                            \
 		trace_mmap_lock_##type(mm,                                     \
 				       memcg_path != NULL ? memcg_path : "",   \
 				       ##__VA_ARGS__);                         \
 		if (likely(memcg_path != NULL))                                \
 			put_memcg_path_buf();                                  \
-		local_unlock(&memcg_paths.lock);                               \
+		local_unlock_irqrestore(&memcg_paths.lock, flags);             \
 	} while (0)
 
 #else /* !CONFIG_MEMCG */

because local_lock_irqsave() is defined as

  #define local_lock_irqsave(lock, flags)	__local_lock_irqsave(lock, flags)

in include/linux/local_lock.h and __local_lock_irqsave() macro is defined as

  #define __local_lock_irqsave(lock, flags)			\
  	do {							\
  		local_irq_save(flags);				\
  		local_lock_acquire(this_cpu_ptr(lock));		\
  	} while (0)

if PREEMPT_RT=n or

  #define __local_lock_irqsave(lock, flags)			\
  	do {							\
  		typecheck(unsigned long, flags);		\
  		flags = 0;					\
  		__local_lock(lock);				\
  	} while (0)

if PREEMPT_RT=y in include/linux/local_lock_internal.h .



But a fundamental question arises here: why do we need to hold
memcg_paths.lock at all? As of commit 44ef20baed8e ("Merge tag
's390-6.10-4' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux"),
"git grep -nF memcg_paths" indicates that memcg_paths.lock is used only
within the TRACE_MMAP_LOCK_EVENT() macro.

  mm/mmap_lock.c:48:static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = {
  mm/mmap_lock.c:63:              memcg_path = per_cpu_ptr(&memcg_paths, cpu);
  mm/mmap_lock.c:101:             rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new);
  mm/mmap_lock.c:135:     struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
  mm/mmap_lock.c:152:     local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx);
  mm/mmap_lock.c:159:             local_lock(&memcg_paths.lock);                                 \
  mm/mmap_lock.c:166:             local_unlock(&memcg_paths.lock);                               \

That is, what is the reason we can't revert commit 832b50725373 and
make the TRACE_MMAP_LOCK_EVENT() macro branch on CONFIG_PREEMPT_RT directly,
as below?

diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 1854850b4b89..1e440c7d48e6 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -153,19 +153,35 @@ static inline void put_memcg_path_buf(void)
 	rcu_read_unlock();
 }
 
+#ifndef CONFIG_PREEMPT_RT
 #define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
 	do {                                                                   \
 		const char *memcg_path;                                        \
-		local_lock(&memcg_paths.lock);                                 \
+		preempt_disable();                                             \
 		memcg_path = get_mm_memcg_path(mm);                            \
 		trace_mmap_lock_##type(mm,                                     \
 				       memcg_path != NULL ? memcg_path : "",   \
 				       ##__VA_ARGS__);                         \
 		if (likely(memcg_path != NULL))                                \
 			put_memcg_path_buf();                                  \
-		local_unlock(&memcg_paths.lock);                               \
+		preempt_enable();                                              \
 	} while (0)
 
+#else
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
+	do {                                                                   \
+		const char *memcg_path;                                        \
+		migrate_disable();                                             \
+		memcg_path = get_mm_memcg_path(mm);                            \
+		trace_mmap_lock_##type(mm,                                     \
+				       memcg_path != NULL ? memcg_path : "",   \
+				       ##__VA_ARGS__);                         \
+		if (likely(memcg_path != NULL))                                \
+			put_memcg_path_buf();                                  \
+		migrate_enable();                                              \
+	} while (0)
+#endif
+
 #else /* !CONFIG_MEMCG */
 
 int trace_mmap_lock_reg(void)

Is the reason that &buf[idx] in get_memcg_path_buf() might go out of
bounds due to preemption in normal context if PREEMPT_RT=y? If so, can't we
add an "idx >= 0 && idx < CONTEXT_COUNT" check to get_memcg_path_buf() and
return NULL if preemption (or interrupts, or recursion if any) has exhausted
the per-CPU buffer?

  /*
   * How many contexts our trace events might be called in: normal, softirq, irq,
   * and NMI.
   */
  #define CONTEXT_COUNT 4





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux