On Sun, 28 Feb 2021 18:22:21 +0500 Mikhail Gavrilov wrote: > On Sat, 13 Feb 2021 at 08:03, Hillf Danton <hdanton@xxxxxxxx> wrote: > > > > The comment below shows a race instance, though I failed to put things > > together to see how within two hours. Cut it and see what will come up. > > > > --- a/mm/z3fold.c > > +++ b/mm/z3fold.c > > @@ -1129,19 +1129,22 @@ retry: > > page = NULL; > > if (can_sleep) { > > spin_lock(&pool->stale_lock); > > + spin_lock(&pool->lock); > > zhdr = list_first_entry_or_null(&pool->stale, > > struct z3fold_header, buddy); > > /* > > - * Before allocating a page, let's see if we can take one from > > + * Before allocating a page, lets see if we can take one from > > * the stale pages list. cancel_work_sync() can sleep so we > > * limit this case to the contexts where we can sleep > > */ > > if (zhdr) { > > list_del(&zhdr->buddy); > > + spin_unlock(&pool->lock); > > spin_unlock(&pool->stale_lock); > > cancel_work_sync(&zhdr->work); > > page = virt_to_page(zhdr); > > } else { > > + spin_unlock(&pool->lock); > > spin_unlock(&pool->stale_lock); > > } > > } > > > Hi, > It happened again with the patch above. Thanks again. > Is anything cleared up now? See below. > > [32451.229358] list_add corruption. next->prev should be prev > (ffffd08fbc661cd0), but was ffffffffa7643650. (next=ffff9e4d2848f1c0). > [32451.229395] ------------[ cut here ]------------ > [32451.229398] kernel BUG at lib/list_debug.c:23! > [32451.229408] invalid opcode: 0000 [#1] SMP NOPTI > [32451.229414] CPU: 4 PID: 80665 Comm: kworker/u64:0 Tainted: G > W --------- --- 5.11.0-155.fc35.x86_64+debug #1 > [32451.229420] Hardware name: System manufacturer System Product > Name/ROG STRIX X570-I GAMING, BIOS 3402 01/13/2021 > [32451.229424] Workqueue: zswap3 compact_page_work > [32451.229433] RIP: 0010:__list_add_valid.cold+0xf/0x3f > [32451.229439] Code: 48 c7 c6 24 26 64 a8 48 89 ef 49 c7 c7 ea ff ff > ff e8 e8 71 01 00 e9 fa 10 9e ff 4c 89 c1 48 c7 c7 f0 26 64 a8 e8 50 > 12 fe ff <0f> 0b 48 89 f2 4c 89 c1 48 89 fe 48 c7 c7 a0 27 64 a8 e8 39 > 12 fe > [32451.229444] RSP: 0018:ffffb08fd553fde0 EFLAGS: 00010286 > [32451.229449] RAX: 0000000000000075 RBX: ffffe16d871550c0 RCX: 0000000000000000 > [32451.229453] RDX: ffff9e53c73e9f60 RSI: ffff9e53c73db2a0 RDI: ffff9e53c73db2a0 > [32451.229457] RBP: ffffd08fbc661cd0 R08: 0000000000000000 R09: ffffb08fd553fc20 > [32451.229460] R10: ffffb08fd553fc18 R11: 0000000000000000 R12: ffff9e4ce4e29008 > [32451.229464] R13: ffff9e4d85543010 R14: ffff9e4d2848f1c0 R15: ffff9e4d85543000 > [32451.229468] FS: 0000000000000000(0000) GS:ffff9e53c7200000(0000) > knlGS:0000000000000000 > [32451.229472] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [32451.229476] CR2: 00000fb21455dfe8 CR3: 0000000142968000 CR4: 0000000000350ee0 > [32451.229480] Call Trace: > [32451.229485] do_compact_page+0x28d/0xb60 > [32451.229492] ? debug_object_deactivate+0x55/0x140 > [32451.229499] ? lock_release+0x1e9/0x400 > [32451.229505] ? lock_release+0x1e9/0x400 > [32451.229511] process_one_work+0x2b0/0x5e0 > [32451.229519] worker_thread+0x55/0x3c0 > [32451.229524] ? process_one_work+0x5e0/0x5e0 > [32451.229531] kthread+0x13a/0x150 > [32451.229540] ? __kthread_bind_mask+0x60/0x60 > [32451.229548] ret_from_fork+0x22/0x30 > [32451.229558] Modules linked in: snd_seq_dummy snd_hrtimer tun uinput > nls_utf8 isofs rfcomm netconsole nft_objref nf_conntrack_netbios_ns > nf_conntrack_broadcast nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib > nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct > nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set > nf_tables nfnetlink cmac bnep zstd sunrpc xpad ff_memless vfat fat > hid_logitech_hidpp hid_logitech_dj joydev snd_hda_codec_realtek > snd_hda_codec_generic ledtrig_audio snd_hda_codec_hdmi uvcvideo > videobuf2_vmalloc snd_hda_intel videobuf2_memops videobuf2_v4l2 > snd_intel_dspcfg mt76x2u soundwire_intel mt76x2_common > videobuf2_common mt76x02_usb soundwire_generic_allocation mt76_usb > intel_rapl_msr intel_rapl_common iwlmvm snd_soc_core snd_usb_audio > videodev snd_compress mt76x02_lib btusb snd_usbmidi_lib > snd_pcm_dmaengine mt76 soundwire_cadence btrtl snd_rawmidi btbcm > snd_hda_codec mc btintel edac_mce_amd mac80211 kvm_amd snd_hda_core > uas bluetooth ac97_bus > [32451.229628] iwlwifi snd_hwdep usb_storage kvm snd_seq libarc4 > snd_seq_device ecdh_generic cfg80211 ecc irqbypass snd_pcm rapl > eeepc_wmi asus_wmi sparse_keymap snd_timer rfkill snd video sp5100_tco > wmi_bmof i2c_piix4 k10temp soundcore acpi_cpufreq binfmt_misc > ip_tables amdgpu drm_ttm_helper ttm iommu_v2 gpu_sched > crct10dif_pclmul crc32_pclmul drm_kms_helper crc32c_intel cec drm igb > ghash_clmulni_intel nvme ccp dca nvme_core i2c_algo_bit wmi > pinctrl_amd fuse > [32451.229696] ---[ end trace 80d86d6942435514 ]--- [...] > Full kernel log is here: https://pastebin.com/4SbhNp7V > > -- > Best Regards, > Mike Gavrilov. What we learn from your reports is 1/ in z3fold_free(), kref_put() creates the ground zero for the race cases reported, 2/ the stale_lock in combination with lock makes things more complicated than thought. Instead of dropping something in the zero spot, the fix below goes the road mentioned before in this mail thread - add another list_head in parallel to the buddy and s/buddy/stale_node/ under every case of stale_lock. --- x/mm/z3fold.c +++ y/mm/z3fold.c @@ -127,6 +127,7 @@ struct z3fold_header { unsigned short first_num:2; unsigned short mapped_count:2; unsigned short foreign_handles:2; + struct list_head stale_node; }; /** @@ -429,6 +430,7 @@ static struct z3fold_header *init_z3fold zhdr->slots = slots; zhdr->pool = pool; INIT_LIST_HEAD(&zhdr->buddy); + INIT_LIST_HEAD(&zhdr->stale_node); INIT_WORK(&zhdr->work, compact_page_work); return zhdr; } @@ -556,7 +558,7 @@ static void __release_z3fold_page(struct z3fold_page_unlock(zhdr); spin_lock(&pool->stale_lock); - list_add(&zhdr->buddy, &pool->stale); + list_add(&zhdr->stale_node, &pool->stale); queue_work(pool->release_wq, &pool->work); spin_unlock(&pool->stale_lock); } @@ -598,10 +600,10 @@ static void free_pages_work(struct work_ spin_lock(&pool->stale_lock); while (!list_empty(&pool->stale)) { struct z3fold_header *zhdr = list_first_entry(&pool->stale, - struct z3fold_header, buddy); + struct z3fold_header, stale_node); struct page *page = virt_to_page(zhdr); - list_del(&zhdr->buddy); + list_del(&zhdr->stale_node); if (WARN_ON(!test_bit(PAGE_STALE, &page->private))) continue; spin_unlock(&pool->stale_lock); @@ -1140,14 +1142,14 @@ retry: if (can_sleep) { spin_lock(&pool->stale_lock); zhdr = list_first_entry_or_null(&pool->stale, - struct z3fold_header, buddy); + struct z3fold_header, stale_node); /* * Before allocating a page, let's see if we can take one from * the stale pages list. cancel_work_sync() can sleep so we * limit this case to the contexts where we can sleep */ if (zhdr) { - list_del(&zhdr->buddy); + list_del(&zhdr->stale_node); spin_unlock(&pool->stale_lock); cancel_work_sync(&zhdr->work); page = virt_to_page(zhdr); --