Re: [tip: sched/core] sched/fair: Multi-LLC select_idle_sibling()

Marek Szyprowski <m.szyprowski@xxxxxxxxxxx> · Mon, 5 Jun 2023 17:25:30 +0200

On 31.05.2023 14:04, tip-bot2 for Peter Zijlstra wrote:
> The following commit has been merged into the sched/core branch of tip:
>
> Commit-ID:     c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
> Gitweb:        https://git.kernel.org/tip/c7dfd6b9122d29d0e9a4587ab470c0564d7f92ab
> Author:        Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> AuthorDate:    Tue, 30 May 2023 13:20:46 +02:00
> Committer:     Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> CommitterDate: Tue, 30 May 2023 22:46:27 +02:00
>
> sched/fair: Multi-LLC select_idle_sibling()
>
> Tejun reported that when he targets workqueues towards a specific LLC
> on his Zen2 machine with 3 cores / LLC and 4 LLCs in total, he gets
> significant idle time.
>
> This is, of course, because of how select_idle_sibling() will not
> consider anything outside of the local LLC, and since all these tasks
> are short running the periodic idle load balancer is ineffective.
>
> And while it is good to keep work cache local, it is better to not
> have significant idle time. Therefore, have select_idle_sibling() try
> other LLCs inside the same node when the local one comes up empty.
>
> Reported-by: Tejun Heo <tj@xxxxxxxxxx>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>

This patch landed in today's linux-next (next-20230605) as commit c5214e13ad60 
("sched/fair: Multi-LLC select_idle_sibling()"). Unfortunately, it causes a 
regression on my ARM 64-bit Exynos5433-based TM2e test board during the 
CPU hotplug tests. From time to time I get a NULL pointer dereference. 
Reverting $subject on top of linux-next fixes the issue. Let me know if 
I can help debug this issue in any way. Here is a complete log (I've 
intentionally kept all the stack dumps, although they don't look very 
relevant...):

# for i in /sys/devices/system/cpu/cpu[1-9]; do echo 0 >$i/online; done
Unable to handle kernel NULL pointer dereference at virtual address 
0000000000000090

======================================================
WARNING: possible circular locking dependency detected
6.4.0-rc1+ #13640 Not tainted
------------------------------------------------------
cpuhp/6/43 is trying to acquire lock:
ffff80000ab65598 (console_owner){..-.}-{0:0}, at: 
console_flush_all+0x1ac/0x4fc

but task is already holding lock:
ffff00002836ed48 (&p->pi_lock){-.-.}-{2:2}, at: try_to_wake_up+0x58/0x46c

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #3 (&p->pi_lock){-.-.}-{2:2}:
        _raw_spin_lock_irqsave+0x60/0x88
        try_to_wake_up+0x58/0x46c
        default_wake_function+0x14/0x20
        autoremove_wake_function+0x18/0x44
        __wake_up_common+0x94/0x170
        __wake_up_common_lock+0x7c/0xcc
        __wake_up+0x18/0x24
        tty_wakeup+0x34/0x70
        tty_port_default_wakeup+0x20/0x38
        tty_port_tty_wakeup+0x18/0x24
        uart_write_wakeup+0x18/0x28
        s3c24xx_serial_tx_chars+0x20c/0x218
        s3c64xx_serial_handle_irq+0x9c/0xe0
        __handle_irq_event_percpu+0xb0/0x2d4
        handle_irq_event+0x4c/0xb8
        handle_fasteoi_irq+0xa4/0x198
        generic_handle_domain_irq+0x2c/0x44
        gic_handle_irq+0x44/0xc4
        call_on_irq_stack+0x24/0x4c
        do_interrupt_handler+0x80/0x84
        el1_interrupt+0x34/0x64
        el1h_64_irq_handler+0x18/0x24
        el1h_64_irq+0x64/0x68
        default_idle_call+0x9c/0x150
        do_idle+0x230/0x294
        cpu_startup_entry+0x28/0x2c
        rest_init+0x100/0x190
        arch_post_acpi_subsys_init+0x0/0x8
        start_kernel+0x594/0x684
        __primary_switched+0xbc/0xc4

-> #2 (&tty->write_wait){-.-.}-{2:2}:
        _raw_spin_lock_irqsave+0x60/0x88
        __wake_up_common_lock+0x5c/0xcc
        __wake_up+0x18/0x24
        tty_wakeup+0x34/0x70
        tty_port_default_wakeup+0x20/0x38
        tty_port_tty_wakeup+0x18/0x24
        uart_write_wakeup+0x18/0x28
        s3c24xx_serial_tx_chars+0x20c/0x218
        s3c64xx_serial_handle_irq+0x9c/0xe0
        __handle_irq_event_percpu+0xb0/0x2d4
        handle_irq_event+0x4c/0xb8
        handle_fasteoi_irq+0xa4/0x198
        generic_handle_domain_irq+0x2c/0x44
        gic_handle_irq+0x44/0xc4
        call_on_irq_stack+0x24/0x4c
        do_interrupt_handler+0x80/0x84
        el1_interrupt+0x34/0x64
        el1h_64_irq_handler+0x18/0x24
        el1h_64_irq+0x64/0x68
        default_idle_call+0x9c/0x150
        do_idle+0x230/0x294
        cpu_startup_entry+0x28/0x2c
        rest_init+0x100/0x190
        arch_post_acpi_subsys_init+0x0/0x8
        start_kernel+0x594/0x684
        __primary_switched+0xbc/0xc4

-> #1 (&port_lock_key){-.-.}-{2:2}:
        _raw_spin_lock_irqsave+0x60/0x88
        s3c24xx_serial_console_write+0xfc/0x124
        console_flush_all+0x208/0x4fc
        console_unlock+0x5c/0x14c
        vprintk_emit+0x15c/0x3b0
        vprintk_default+0x38/0x44
        vprintk+0xc0/0xe4
        _printk+0x5c/0x84
        register_console+0x1f4/0x420
        uart_add_one_port+0x50c/0x53c
        s3c24xx_serial_probe+0x34c/0x72c
        platform_probe+0x68/0xd8
        really_probe+0x148/0x2b4
        __driver_probe_device+0x78/0x12c
        driver_probe_device+0xd8/0x160
        __driver_attach+0x9c/0x1ac
        bus_for_each_dev+0x74/0xd4
        driver_attach+0x24/0x30
        bus_add_driver+0xe4/0x1e8
        driver_register+0x60/0x128
        __platform_driver_register+0x28/0x34
        samsung_serial_init+0x30/0x8c
        do_one_initcall+0x74/0x2f0
        kernel_init_freeable+0x288/0x4d8
        kernel_init+0x24/0x1dc
        ret_from_fork+0x10/0x20

-> #0 (console_owner){..-.}-{0:0}:
        __lock_acquire+0x13d0/0x217c
        lock_acquire+0x1e8/0x310
        console_flush_all+0x1f4/0x4fc
        console_unlock+0x5c/0x14c
        vprintk_emit+0x15c/0x3b0
        vprintk_default+0x38/0x44
        vprintk+0xc0/0xe4
        _printk+0x5c/0x84
        die_kernel_fault+0x48/0x37c
        __do_kernel_fault+0xd8/0x19c
        do_page_fault+0xac/0x6d8
        do_translation_fault+0xac/0xb8
        do_mem_abort+0x44/0x94
        el1_abort+0x44/0x70
        el1h_64_sync_handler+0xd8/0xe4
        el1h_64_sync+0x64/0x68
        __bitmap_and+0x4c/0x78
        select_task_rq_fair+0x724/0x1a30
        try_to_wake_up+0x17c/0x46c
        wake_up_process+0x18/0x24
        complete+0x58/0x8c
        __kthread_parkme+0x74/0xc8
        kthread_parkme+0x20/0x44
        smpboot_thread_fn+0x118/0x2a0
        kthread+0x124/0x128
        ret_from_fork+0x10/0x20

other info that might help us debug this:

Chain exists of:
   console_owner --> &tty->write_wait --> &p->pi_lock

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(&p->pi_lock);
                                lock(&tty->write_wait);
                                lock(&p->pi_lock);
   lock(console_owner);

  *** DEADLOCK ***

5 locks held by cpuhp/6/43:
  #0: ffff000023e68440 (&x->wait){....}-{2:2}, at: complete+0x24/0x8c
  #1: ffff00002836ed48 (&p->pi_lock){-.-.}-{2:2}, at: 
try_to_wake_up+0x58/0x46c
  #2: ffff80000abd6ac0 (rcu_read_lock){....}-{1:2}, at: 
select_task_rq_fair+0x114/0x1a30
  #3: ffff80000ab65390 (console_lock){+.+.}-{0:0}, at: 
vprintk_default+0x38/0x44
  #4: ffff80000ab65440 (console_srcu){....}-{0:0}, at: 
console_flush_all+0x7c/0x4fc

stack backtrace:
CPU: 6 PID: 43 Comm: cpuhp/6 Not tainted 6.4.0-rc1+ #13640
Hardware name: Samsung TM2E board (DT)
Call trace:
  dump_backtrace+0x98/0xf0
  show_stack+0x18/0x24
  dump_stack_lvl+0x60/0xac
  dump_stack+0x18/0x24
  print_circular_bug+0x26c/0x348
  check_noncircular+0x134/0x148
  __lock_acquire+0x13d0/0x217c
  lock_acquire+0x1e8/0x310
  console_flush_all+0x1f4/0x4fc
  console_unlock+0x5c/0x14c
  vprintk_emit+0x15c/0x3b0
  vprintk_default+0x38/0x44
  vprintk+0xc0/0xe4
  _printk+0x5c/0x84
  die_kernel_fault+0x48/0x37c
  __do_kernel_fault+0xd8/0x19c
  do_page_fault+0xac/0x6d8
  do_translation_fault+0xac/0xb8
  do_mem_abort+0x44/0x94
  el1_abort+0x44/0x70
  el1h_64_sync_handler+0xd8/0xe4
  el1h_64_sync+0x64/0x68
  __bitmap_and+0x4c/0x78
  select_task_rq_fair+0x724/0x1a30
  try_to_wake_up+0x17c/0x46c
  wake_up_process+0x18/0x24
  complete+0x58/0x8c
  __kthread_parkme+0x74/0xc8
  kthread_parkme+0x20/0x44
  smpboot_thread_fn+0x118/0x2a0
  kthread+0x124/0x128
  ret_from_fork+0x10/0x20
Mem abort info:
   ESR = 0x0000000096000006
   EC = 0x25: DABT (current EL), IL = 32 bits
   SET = 0, FnV = 0
   EA = 0, S1PTW = 0
   FSC = 0x06: level 2 translation fault
Data abort info:
   ISV = 0, ISS = 0x00000006
   CM = 0, WnR = 0
user pgtable: 4k pages, 48-bit VAs, pgdp=000000002783e000
[0000000000000090] pgd=080000002738f003, p4d=080000002738f003, 
pud=0800000027a24003, pmd=0000000000000000
Internal error: Oops: 0000000096000006 [#1] PREEMPT SMP
Modules linked in: brcmfmac_wcc cpufreq_powersave cpufreq_conservative 
brcmfmac brcmutil cfg80211 crct10dif_ce hci_uart btqca btbcm bluetooth 
s5p_jpeg exynos_gsc s3fwrn5_i2c s3fwrn5 s5p_mfc nci v4l2_mem2mem 
ecdh_generic nfc ecc videobuf2_dma_contig videobuf2_memops 
videobuf2_v4l2 videodev rfkill panfrost videobuf2_common 
drm_shmem_helper gpu_sched mc ip_tables x_tables ipv6

CPU: 6 PID: 43 Comm: cpuhp/6 Not tainted 6.4.0-rc1+ #13640
Hardware name: Samsung TM2E board (DT)
pstate: 000000c5 (nzcv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : __bitmap_and+0x4c/0x78
lr : select_idle_cpu+0x64/0x450
sp : ffff80000bd83b50
x29: ffff80000bd83b50 x28: ffff80000a152ad8 x27: ffff00002836e500
x26: ffff00002814f600 x25: ffff80000ab43e78 x24: 0000000000000000
x23: ffff80000ab3f000 x22: 0000000000000000 x21: ffff80000ab43e78
x20: 0000000000000000 x19: 0000000000000000 x18: ffff8000099ac098
x17: 0000000000000000 x16: 0000000000000067 x15: 0000000000000001
x14: 0000000000000000 x13: 00000000000000d8 x12: 0000000000000000
x11: 0000000000000001 x10: ffff80000b7c6e90 x9 : 0000000000000000
x8 : 0000000000000000 x7 : 0000000000000000 x6 : 0000000000000000
x5 : 0000000000000020 x4 : 00000000000000ff x3 : 00000000fffffff8
x2 : ffff00002836e7e0 x1 : 0000000000000090 x0 : ffff0000d5fc2ad8
Call trace:
  __bitmap_and+0x4c/0x78
  select_task_rq_fair+0x724/0x1a30
  try_to_wake_up+0x17c/0x46c
  wake_up_process+0x18/0x24
  complete+0x58/0x8c
  __kthread_parkme+0x74/0xc8
  kthread_parkme+0x20/0x44
  smpboot_thread_fn+0x118/0x2a0
  kthread+0x124/0x128
  ret_from_fork+0x10/0x20
Code: 2a0803e8 4b0303e3 92800004 9ac32484 (f8687823)
---[ end trace 0000000000000000 ]---
Kernel panic - not syncing: Oops: Fatal exception
SMP: stopping secondary CPUs
Kernel Offset: disabled
CPU features: 0x8c0004,1c780800,0000421b
Memory Limit: none
---[ end Kernel panic - not syncing: Oops: Fatal exception ]---

> ---
>   kernel/sched/fair.c     | 38 ++++++++++++++++++++++++++++++++++++++
>   kernel/sched/features.h |  1 +
>   2 files changed, 39 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 48b6f0c..0172458 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7028,6 +7028,38 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>   }
>   
>   /*
> + * For the multiple-LLC per node case, make sure to try the other LLC's if the
> + * local LLC comes up empty.
> + */
> +static int
> +select_idle_node(struct task_struct *p, struct sched_domain *sd, int target)
> +{
> +	struct sched_domain *parent = sd->parent;
> +	struct sched_group *sg;
> +
> +	/* Make sure to not cross nodes. */
> +	if (!parent || parent->flags & SD_NUMA)
> +		return -1;
> +
> +	sg = parent->groups;
> +	do {
> +		int cpu = cpumask_first(sched_group_span(sg));
> +		struct sched_domain *sd_child;
> +
> +		sd_child = per_cpu(sd_llc, cpu);
> +		if (sd_child != sd) {
> +			int i = select_idle_cpu(p, sd_child, test_idle_cores(cpu), cpu);
> +			if ((unsigned)i < nr_cpumask_bits)
> +				return i;
> +		}
> +
> +		sg = sg->next;
> +	} while (sg != parent->groups);
> +
> +	return -1;
> +}
> +
> +/*
>    * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
>    * the task fits. If no CPU is big enough, but there are idle ones, try to
>    * maximize capacity.
> @@ -7199,6 +7231,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>   	if ((unsigned)i < nr_cpumask_bits)
>   		return i;
>   
> +	if (sched_feat(SIS_NODE)) {
> +		i = select_idle_node(p, sd, target);
> +		if ((unsigned)i < nr_cpumask_bits)
> +			return i;
> +	}
> +
>   	return target;
>   }
>   
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index ee7f23c..9e390eb 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -62,6 +62,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
>    */
>   SCHED_FEAT(SIS_PROP, false)
>   SCHED_FEAT(SIS_UTIL, true)
> +SCHED_FEAT(SIS_NODE, true)
>   
>   /*
>    * Issue a WARN when we do multiple update_rq_clock() calls

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland