Re: PROBLEM: double fault in md_end_io

Paweł Wiejacha <pawel.wiejacha@xxxxxxxxxxxx> · Tue, 13 Apr 2021 14:05:20 +0200

Hello Song,

That code does not compile, but I guess that what you meant was
something like this:

diff --git drivers/md/md.c drivers/md/md.c
index 04384452a..cbc97a96b 100644
--- drivers/md/md.c
+++ drivers/md/md.c
@@ -78,6 +78,7 @@ static DEFINE_SPINLOCK(pers_lock);

 static struct kobj_type md_ktype;

+struct kmem_cache *md_io_cache;
 struct md_cluster_operations *md_cluster_ops;
 EXPORT_SYMBOL(md_cluster_ops);
 static struct module *md_cluster_mod;
@@ -5701,8 +5702,8 @@ static int md_alloc(dev_t dev, char *name)
         */
        mddev->hold_active = UNTIL_STOP;

-   error = mempool_init_kmalloc_pool(&mddev->md_io_pool, BIO_POOL_SIZE,
-                     sizeof(struct md_io));
+   error = mempool_init_slab_pool(&mddev->md_io_pool, BIO_POOL_SIZE,
+                     md_io_cache);
    if (error)
        goto abort;

@@ -9542,6 +9543,10 @@ static int __init md_init(void)
 {
    int ret = -ENOMEM;

+   md_io_cache = KMEM_CACHE(md_io, 0);
+   if (!md_io_cache)
+       goto err_md_io_cache;
+
    md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
    if (!md_wq)
        goto err_wq;
@@ -9578,6 +9583,8 @@ static int __init md_init(void)
 err_misc_wq:
    destroy_workqueue(md_wq);
 err_wq:
+   kmem_cache_destroy(md_io_cache);
+err_md_io_cache:
    return ret;
 }

@@ -9863,6 +9870,7 @@ static __exit void md_exit(void)
    destroy_workqueue(md_rdev_misc_wq);
    destroy_workqueue(md_misc_wq);
    destroy_workqueue(md_wq);
+   kmem_cache_destroy(md_io_cache);
 }

 subsys_initcall(md_init);

Unfortunately, now it crashes with:

<0>[  625.159576] traps: PANIC: double fault, error_code: 0x0
<4>[  625.159578] double fault: 0000 [#1] SMP NOPTI
<4>[  625.159579] CPU: 32 PID: 0 Comm: swapper/32 Tainted: P
OE     5.11.8-pw-fix1 #1
<4>[  625.159579] Hardware name: empty S8030GM2NE/S8030GM2NE, BIOS
V4.00 03/11/2021
<4>[  625.159580] RIP: 0010:__slab_free+0x26/0x380
<4>[  625.159580] Code: 1f 44 00 00 0f 1f 44 00 00 55 49 89 ca 45 89
c3 48 89 e5 41 57 41 56 49 89 fe 41 55 41 54 49 89 f4 53 48 83 e4 f0
48 83 ec 70 <48> 89 54 24 28 0f 1f 44 00 00 41 8b 46 28 4d 8b 6c 24 20
49 8b 5c
<4>[  625.159581] RSP: 0018:ffffb11c4d028fc0 EFLAGS: 00010086
<4>[  625.159582] RAX: ffff9a38cfc34fd0 RBX: ffff9a1a1360b870 RCX:
ffff9a1a1360b870
<4>[  625.159583] RDX: ffff9a1a1360b870 RSI: ffffe328044d82c0 RDI:
ffff9a39536e5e00
<4>[  625.159583] RBP: ffffb11c4d029060 R08: 0000000000000001 R09:
ffffffff9f850a57
<4>[  625.159583] R10: ffff9a1a1360b870 R11: 0000000000000001 R12:
ffffe328044d82c0
<4>[  625.159584] R13: ffffe328044d82c0 R14: ffff9a39536e5e00 R15:
ffff9a39536e5e00
<4>[  625.159584] FS:  0000000000000000(0000)
GS:ffff9a38cfc00000(0000) knlGS:0000000000000000
<4>[  625.159585] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
<4>[  625.159585] CR2: ffffb11c4d028fb8 CR3: 000000407aece000 CR4:
0000000000350ee0
<4>[  625.159585] Call Trace:
<4>[  625.159586]  <IRQ>
<4>[  625.159586]  ? kmem_cache_free+0x3bf/0x410
<4>[  625.159586]  ? mempool_free_slab+0x17/0x20
<4>[  625.159587]  ? mempool_free_slab+0x17/0x20
<4>[  625.159587]  ? mempool_free+0x2f/0x80
<4>[  625.159587]  ? md_end_io+0x4a/0x70
<4>[  625.159587]  ? bio_endio+0xe0/0x120
<4>[  625.159588]  ? bio_chain_endio+0x2d/0x40
<4>[  625.159588]  ? md_end_io+0x5c/0x70
<4>[  625.159588]  ? bio_endio+0xe0/0x120
<4>[  625.159589]  ? bio_chain_endio+0x2d/0x40
<4>[  625.159589]  ? md_end_io+0x5c/0x70
<4>[  625.159589]  ? bio_endio+0xe0/0x120
<4>[  625.159589]  ? bio_chain_endio+0x2d/0x40
<4>[  625.159590]  ? md_end_io+0x5c/0x70
<4>[  625.159590]  ? bio_endio+0xe0/0x120
...
<4>[  625.159663] Lost 344 message(s)!

Some info from a moment "just" (~0.2 s) before the crash:

$ cd /sys/dev/block/9:12 ;  watch -n0.2 'for f in `find . -maxdepth 1
-type f` `ls queue/* |grep -v usec` ; do echo $f `cat $f`; done | tee
-a ~/md'
./uevent MAJOR=9 MINOR=12 DEVNAME=md12 DEVTYPE=disk
./ext_range 256
./range 1
./alignment_offset 0
./dev 9:12
./ro 0
./stat 577 0 10446 84 46128254 0 11584858627 400313072 660 1289080
400313156 0 0 0 0 0 0
./events_poll_msecs -1
./events_async
./size 41986624512
./discard_alignment 0
./capability 50
./hidden 0
./removable 0
./events
./inflight 0 666
queue/add_random 0
queue/chunk_sectors 0
queue/dax 0
queue/discard_granularity 512
queue/discard_max_bytes 2199023255040
queue/discard_max_hw_bytes 2199023255040
queue/discard_zeroes_data 0
queue/fua 1
queue/hw_sector_size 512
queue/io_poll 0
queue/io_poll_delay 0
queue/iostats 0
queue/logical_block_size 512
queue/max_discard_segments 256
queue/max_hw_sectors_kb 128
queue/max_integrity_segments 0
queue/max_sectors_kb 128
queue/max_segments 33
queue/max_segment_size 4294967295
queue/minimum_io_size 524288
queue/nomerges 0
queue/nr_requests 128
queue/nr_zones 0
queue/optimal_io_size 1572864
queue/physical_block_size 512
queue/read_ahead_kb 3072
queue/rotational 0
queue/rq_affinity 0
queue/scheduler none
queue/stable_writes 0
queue/write_cache write back
queue/write_same_max_bytes 0
queue/write_zeroes_max_bytes 0
queue/zone_append_max_bytes 0
queue/zoned none


$ cd /sys/kernel/slab; watch -n0.2 'for f in `ls md_io/* | grep -v
calls`; do echo $f `cat $f`; done | tee -a ~/slab'
md_io/aliases 4
md_io/align 8
md_io/cache_dma 0
md_io/cpu_partial 30
md_io/cpu_slabs 625 N0=250 N1=84 N2=23 N3=268
md_io/ctor
md_io/destroy_by_rcu 0
md_io/hwcache_align 0
md_io/min_partial 5
md_io/objects 63852 N0=23256 N1=12138 N2=3264 N3=25194
md_io/object_size 40
md_io/objects_partial 0
md_io/objs_per_slab 102
md_io/order 0
md_io/partial 8 N0=3 N1=3 N3=2
md_io/poison 0
md_io/reclaim_account 0
md_io/red_zone 0
md_io/remote_node_defrag_ratio 100
md_io/sanity_checks 0
md_io/shrink
md_io/slabs 634 N0=231 N1=122 N2=32 N3=249
md_io/slabs_cpu_partial 559(559) C0=21(21) C1=16(16) C2=12(12)
C3=21(21) C4=7(7) C5=23(23) C6=19(19) C7=19(19) C8=5(5) C9=4(4)
C10=3(3) C12=5(5) C13=6(6) C14=6(6) C18=1(1) C19=2(2) C24=19(19)
C25=20(20) C26=15(15
) C27=11(11) C28=17(17) C29=14(14) C30=23(23) C31=14(14) C32=21(21)
C33=8(8) C34=19(19) C35=12(12) C36=19(19) C37=17(17) C38=15(15)
C39=7(7) C40=3(3) C41=3(3) C42=1(1) C45=3(3) C46=5(5) C47=1(1)
C48=1(1) C49=1(1)
 C53=1(1) C56=3(3) C57=5(5) C58=21(21) C59=25(25) C60=5(5) C61=25(25)
C62=17(17) C63=18(18)
md_io/slab_size 40
md_io/store_user 0
md_io/total_objects 64668 N0=23562 N1=12444 N2=3264 N3=25398
md_io/trace 0
md_io/usersize 0
md_io/validate


$ watch -n0.2 'cat /proc/meminfo | paste - - | tee -a ~/meminfo'
MemTotal:       528235648 kB    MemFree:        20002732 kB
MemAvailable:   483890268 kB    Buffers:            7356 kB
Cached:         495416180 kB    SwapCached:            0 kB
Active:         96396800 kB     Inactive:       399891308 kB
Active(anon):      10976 kB     Inactive(anon):   890908 kB
Active(file):   96385824 kB     Inactive(file): 399000400 kB
Unevictable:       78768 kB     Mlocked:           78768 kB
SwapTotal:             0 kB     SwapFree:              0 kB
Dirty:          88422072 kB     Writeback:        948756 kB
AnonPages:        945772 kB     Mapped:            57300 kB
Shmem:             26300 kB     KReclaimable:    7248160 kB
Slab:            7962748 kB     SReclaimable:    7248160 kB
SUnreclaim:       714588 kB     KernelStack:       18288 kB
PageTables:        10796 kB     NFS_Unstable:          0 kB
Bounce:                0 kB     WritebackTmp:          0 kB
CommitLimit:    264117824 kB    Committed_AS:   21816824 kB
VmallocTotal:   34359738367 kB  VmallocUsed:      561588 kB
VmallocChunk:          0 kB     Percpu:            65792 kB
HardwareCorrupted:     0 kB     AnonHugePages:         0 kB
ShmemHugePages:        0 kB     ShmemPmdMapped:        0 kB
FileHugePages:         0 kB     FilePmdMapped:         0 kB
HugePages_Total:       0        HugePages_Free:        0
HugePages_Rsvd:        0        HugePages_Surp:        0
Hugepagesize:       2048 kB     Hugetlb:               0 kB
DirectMap4k:      541000 kB     DirectMap2M:    11907072 kB
DirectMap1G:    525336576 kB


Regards,
Paweł Wiejacha.


On Mon, 12 Apr 2021 at 08:49, Song Liu <song@xxxxxxxxxx> wrote:
>
> On Fri, Apr 9, 2021 at 2:41 PM Paweł Wiejacha
> <pawel.wiejacha@xxxxxxxxxxxx> wrote:
> >
> > Hello,
> >
> > Two of my machines constantly crash with a double fault like this:
> >
> > 1146  <0>[33685.629591] traps: PANIC: double fault, error_code: 0x0
> > 1147  <4>[33685.629593] double fault: 0000 [#1] SMP NOPTI
> > 1148  <4>[33685.629594] CPU: 10 PID: 2118287 Comm: kworker/10:0
> > Tainted: P           OE     5.11.8-051108-generic #202103200636
> > 1149  <4>[33685.629595] Hardware name: ASUSTeK COMPUTER INC. KRPG-U8
> > Series/KRPG-U8 Series, BIOS 4201 09/25/2020
> > 1150  <4>[33685.629595] Workqueue: xfs-conv/md12 xfs_end_io [xfs]
> > 1151  <4>[33685.629596] RIP: 0010:__slab_free+0x23/0x340
> > 1152  <4>[33685.629597] Code: 4c fe ff ff 0f 1f 00 0f 1f 44 00 00 55
> > 48 89 e5 41 57 49 89 cf 41 56 49 89 fe 41 55 41 54 49 89 f4 53 48 83
> > e4 f0 48 83 ec 70 <48> 89 54 24 28 0f 1f 44 00 00 41 8b 46 28 4d 8b 6c
> > 24 20 49 8b 5c
> > 1153  <4>[33685.629598] RSP: 0018:ffffa9bc00848fa0 EFLAGS: 00010086
> > 1154  <4>[33685.629599] RAX: ffff94c04d8b10a0 RBX: ffff94437a34a880
> > RCX: ffff94437a34a880
> > 1155  <4>[33685.629599] RDX: ffff94437a34a880 RSI: ffffcec745e8d280
> > RDI: ffff944300043b00
> > 1156  <4>[33685.629599] RBP: ffffa9bc00849040 R08: 0000000000000001
> > R09: ffffffff82a5d6de
> > 1157  <4>[33685.629600] R10: 0000000000000001 R11: 000000009c109000
> > R12: ffffcec745e8d280
> > 1158  <4>[33685.629600] R13: ffff944300043b00 R14: ffff944300043b00
> > R15: ffff94437a34a880
> > 1159  <4>[33685.629601] FS:  0000000000000000(0000)
> > GS:ffff94c04d880000(0000) knlGS:0000000000000000
> > 1160  <4>[33685.629601] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > 1161  <4>[33685.629602] CR2: ffffa9bc00848f98 CR3: 000000014d04e000
> > CR4: 0000000000350ee0
> > 1162  <4>[33685.629602] Call Trace:
> > 1163  <4>[33685.629603]  <IRQ>
> > 1164  <4>[33685.629603]  ? kfree+0x3bc/0x3e0
> > 1165  <4>[33685.629603]  ? mempool_kfree+0xe/0x10
> > 1166  <4>[33685.629603]  ? mempool_kfree+0xe/0x10
> > 1167  <4>[33685.629604]  ? mempool_free+0x2f/0x80
> > 1168  <4>[33685.629604]  ? md_end_io+0x4a/0x70
> > 1169  <4>[33685.629604]  ? bio_endio+0xdc/0x130
> > 1170  <4>[33685.629605]  ? bio_chain_endio+0x2d/0x40
> > 1171  <4>[33685.629605]  ? md_end_io+0x5c/0x70
> > 1172  <4>[33685.629605]  ? bio_endio+0xdc/0x130
> > 1173  <4>[33685.629605]  ? bio_chain_endio+0x2d/0x40
> > 1174  <4>[33685.629606]  ? md_end_io+0x5c/0x70
> > 1175  <4>[33685.629606]  ? bio_endio+0xdc/0x130
> > 1176  <4>[33685.629606]  ? bio_chain_endio+0x2d/0x40
> > 1177  <4>[33685.629607]  ? md_end_io+0x5c/0x70
> > ... repeated ...
> > 1436  <4>[33685.629677]  ? bio_endio+0xdc/0x130
> > 1437  <4>[33685.629677]  ? bio_chain_endio+0x2d/0x40
> > 1438  <4>[33685.629677]  ? md_end_io+0x5c/0x70
> > 1439  <4>[33685.629677]  ? bio_endio+0xdc/0x130
> > 1440  <4>[33685.629678]  ? bio_chain_endio+0x2d/0x40
> > 1441  <4>[33685.629678]  ? md_
> > 1442  <4>[33685.629679] Lost 357 message(s)!
> >
> > This happens on:
> > 5.11.8-051108-generic #202103200636 SMP Sat Mar 20 11:17:32 UTC 2021
> > and on 5.8.0-44-generic #50~20.04.1-Ubuntu
> > (https://changelogs.ubuntu.com/changelogs/pool/main/l/linux/linux_5.8.0-44.50/changelog)
> > which contains backported
> > https://github.com/torvalds/linux/commit/41d2d848e5c09209bdb57ff9c0ca34075e22783d
> > ("md: improve io stats accounting").
> > The 5.8.18-050818-generic #202011011237 SMP Sun Nov 1 12:40:15 UTC
> > 2020 which does not contain above suspected change does not crash.
> >
> > If there's a better way/place to report this bug just let me know. If
> > not, here are steps to reproduce:
> >
> > 1. Create a RAID 0 device using three Micron_9300_MTFDHAL7T6TDP disks.
> > mdadm --create --verbose /dev/md12 --level=stripe --raid-devices=3
> > /dev/nvme0n1p1 /dev/nvme1n1p1 /dev/nvme2n1p1
> >
> > 2. Setup xfs on it:
> > mkfs.xfs /dev/md12 and mount it
> >
> > 3. Write to a file on this filesystem:
> > while true; do rm -rf /mnt/md12/crash* ; for i in `seq 8`; do dd
> > if=/dev/zero of=/mnt/md12/crash$i bs=32K count=50000000 & done; wait;
> > done
> > Wait for a crash (usually less than 20 min).
> >
> > I couldn't reproduce it with a single dd process (maybe I have to wait
> > a little longer), but a single cat
> > /very/large/file/on/cephfs/over100GbE > /mnt/md12/crash is enough for
> > this double fault to occur.
> >
> > More info:
> > This long mempool_kfree - md_end_io - *  -md_end_io stack trace looks
> > always the same, but the panic occurs in different places:
> >
> > pstore/6948115143318/dmesg.txt-<4>[545649.087998] CPU: 88 PID: 0 Comm:
> > swapper/88 Tainted: P           OE     5.11.8-051108-generic
> > #202103200636
> > pstore/6948377398316/dmesg.txt-<4>[11275.914909] CPU: 14 PID: 0 Comm:
> > swapper/14 Tainted: P           OE     5.11.8-051108-generic
> > #202103200636
> > pstore/6948532816002/dmesg.txt-<4>[33685.629594] CPU: 10 PID: 2118287
> > Comm: kworker/10:0 Tainted: P           OE     5.11.8-051108-generic
> > #202103200636
> > pstore/6948532816002/dmesg.txt-<4>[33685.629595] Workqueue:
> > xfs-conv/md12 xfs_end_io [xfs]
> > pstore/6948855849083/dmesg.txt-<4>[42934.321129] CPU: 85 PID: 0 Comm:
> > swapper/85 Tainted: P           OE     5.11.8-051108-generic
> > #202103200636
> > pstore/6948876331782/dmesg.txt-<4>[ 3475.020672] CPU: 86 PID: 0 Comm:
> > swapper/86 Tainted: P           OE     5.11.8-051108-generic
> > #202103200636
> > pstore/6949083860307/dmesg.txt-<4>[43048.254375] CPU: 45 PID: 0 Comm:
> > swapper/45 Tainted: P           OE     5.11.8-051108-generic
> > #202103200636
> > pstore/6949091775931/dmesg.txt-<4>[ 1150.790240] CPU: 64 PID: 0 Comm:
> > swapper/64 Tainted: P           OE     5.11.8-051108-generic
> > #202103200636
> > pstore/6949123356826/dmesg.txt-<4>[ 6963.858253] CPU: 6 PID: 51 Comm:
> > kworker/6:0 Tainted: P           OE     5.11.8-051108-generic
> > #202103200636
> > pstore/6949123356826/dmesg.txt-<4>[ 6963.858255] Workqueue: ceph-msgr
> > ceph_con_workfn [libceph]
> > pstore/6949123356826/dmesg.txt-<4>[ 6963.858253] CPU: 6 PID: 51 Comm:
> > kworker/6:0 Tainted: P           OE     5.11.8-051108-generic
> > #202103200636
> > pstore/6949123356826/dmesg.txt-<4>[ 6963.858255] Workqueue: ceph-msgr
> > ceph_con_workfn [libceph]
> > pstore/6949152322085/dmesg.txt-<4>[ 6437.077874] CPU: 59 PID: 0 Comm:
> > swapper/59 Tainted: P           OE     5.11.8-051108-generic
> > #202103200636
> >
> > cat /proc/cmdline
> > BOOT_IMAGE=/vmlinuz-5.11.8-051108-generic
> > root=/dev/mapper/ubuntu--vg-ubuntu--lv ro net.ifnames=0 biosdevname=0
> > strict-devmem=0 mitigations=off iommu=pt
> >
> > cat /proc/cpuinfo
> > model name      : AMD EPYC 7552 48-Core Processor
> >
> > cat /proc/mounts
> > /dev/md12 /mnt/ssd1 xfs
> > rw,noatime,attr2,inode64,logbufs=8,logbsize=32k,sunit=1024,swidth=3072,prjquota
> > 0 0
> >
> > Let me know if you need more information.
>
> Hi Pawel,
>
> Thanks for the report. Could you please try whether the following
> change fixes the issue?
>
> Song
>
> diff --git i/drivers/md/md.c w/drivers/md/md.c
> index bb09070a63bcf..889e9440a9f38 100644
> --- i/drivers/md/md.c
> +++ w/drivers/md/md.c
> @@ -5648,7 +5648,7 @@ static int md_alloc(dev_t dev, char *name)
>                  */
>                 mddev->hold_active = UNTIL_STOP;
>
> -       error = mempool_init_kmalloc_pool(&mddev->md_io_pool, BIO_POOL_SIZE,
> +       error = mempool_init_slab_pool(&mddev->md_io_pool, BIO_POOL_SIZE,
>                                           sizeof(struct md_io));
>         if (error)
>                 goto abort;