Re: Known and unfixed active data loss bug in MM + XFS with large folios since Dec 2021 (any kernel from 6.1 upwards)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 9/17/24 5:32 AM, Matthew Wilcox wrote:
> On Mon, Sep 16, 2024 at 10:47:10AM +0200, Chris Mason wrote:
>> I've got a bunch of assertions around incorrect folio->mapping and I'm
>> trying to bash on the ENOMEM for readahead case.  There's a GFP_NOWARN
>> on those, and our systems do run pretty short on ram, so it feels right
>> at least.  We'll see.
> 
> I've been running with some variant of this patch the whole way across
> the Atlantic, and not hit any problems.  But maybe with the right
> workload ...?
> 
> There are two things being tested here.  One is whether we have a
> cross-linked node (ie a node that's in two trees at the same time).
> The other is whether the slab allocator is giving us a node that already
> contains non-NULL entries.
> 
> If you could throw this on top of your kernel, we might stand a chance
> of catching the problem sooner.  If it is one of these problems and not
> something weirder.
> 

This fires in roughly 10 seconds for me on top of v6.11.  Since array seems
to always be 1, I'm not sure if the assertion is right, but hopefully you
can trigger it yourself.

reader.c is attached.  It just has one thread doing large reads and two
threads fadvising things away.  The important part seems to be two threads
calling fadvise DONTNEED in parallel at the same time; just one thread
wasn't enough.

[root@kerneltest003-kvm ~]# cat small.sh
#!/bin/bash

mkfs.xfs -f /dev/vdb
mount /dev/vdb /xfs
fallocate -l10g /xfs/file1
./reader /xfs/file1
[root@kerneltest003-kvm ~]# ./small.sh
meta-data=/dev/vdb               isize=512    agcount=10, agsize=268435455 blks
         =                       sectsz=512   attr=2, projid32bit=1
         =                       crc=1        finobt=1, sparse=1, rmapbt=0
         =                       reflink=1    bigtime=1 inobtcount=1 nrext64=0
data     =                       bsize=4096   blocks=2684354550, imaxpct=5
         =                       sunit=0      swidth=0 blks
naming   =version 2              bsize=4096   ascii-ci=0, ftype=1
log      =internal log           bsize=4096   blocks=521728, version=2
         =                       sectsz=512   sunit=0 blks, lazy-count=1
realtime =none                   extsz=4096   blocks=0, rtextents=0
Discarding blocks...Done.
[  102.013720] XFS (vdb): Mounting V5 Filesystem c3531255-dee1-4b86-8e14-2baa3cc900f8
[  102.029638] XFS (vdb): Ending clean mount
[  104.204205] node ffff888119f86ba8 offset 13 parent ffff888119f84988 shift 6 count 0 values 0 array 0000000000000001 list ffffffff81f93230 0000000000000000 marks 0 0 0
+[  104.206996] ------------[ cut here ]------------
[  104.207948] kernel BUG at lib/xarray.c:211!
[  104.208729] Oops: invalid opcode: 0000 [#1] SMP PTI
[  104.209627] CPU: 51 UID: 0 PID: 862 Comm: reader Not tainted 6.11.0-dirty #24
[  104.211232] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[  104.213402] RIP: 0010:xas_load+0xe4/0x120
[  104.214144] Code: 00 10 00 00 76 c4 48 83 fa 02 75 ad 41 b8 02 04 00 00 eb a5 40 f6 c6 03 75 12 48 89 f7 e8 44 f5 ff ff 0f 0b 49 83 f8 02 75 10 <0f> 0b 48 c7 c7 76 58 98 82 e8 7e 3b 1a ff eb e8 40 f6 c6 03 75 0a
[  104.217593] RSP: 0018:ffffc90001b57b90 EFLAGS: 00010296
[  104.218729] RAX: 0000000000000000 RBX: ffffc90001b57bc8 RCX: 0000000000000000
[  104.220019] RDX: ffff88b177aee180 RSI: ffff88b177ae0b80 RDI: ffff88b177ae0b80
[  104.221394] RBP: 000000000027ffff R08: ffffffff8396b4a8 R09: 0000000000000003
[  104.222679] R10: ffffffff8326b4c0 R11: ffffffff837eb4c0 R12: ffffc90001b57d48
[  104.223985] R13: ffffc90001b57c48 R14: ffffc90001b57c50 R15: 0000000000000000
[  104.225277] FS:  00007fcee02006c0(0000) GS:ffff88b177ac0000(0000) knlGS:0000000000000000
[  104.226726] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  104.227768] CR2: 00007fcee01fff78 CR3: 000000011bdc2004 CR4: 0000000000770ef0
[  104.229055] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  104.230341] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  104.231625] PKRU: 55555554
[  104.232131] Call Trace:
[  104.232586]  <TASK>
[  104.232984]  ? die+0x33/0x90
[  104.233531]  ? do_trap+0xda/0x100
[  104.234206]  ? do_error_trap+0x65/0x80
[  104.234893]  ? xas_load+0xe4/0x120
[  104.235524]  ? exc_invalid_op+0x4e/0x70
[  104.236231]  ? xas_load+0xe4/0x120
[  104.236855]  ? asm_exc_invalid_op+0x16/0x20
[  104.237638]  ? xas_load+0xe4/0x120
[  104.238268]  xas_find+0x18c/0x1f0
[  104.238878]  find_lock_entries+0x6d/0x2f0
[  104.239617]  mapping_try_invalidate+0x5e/0x150
[  104.240432]  ? update_load_avg+0x78/0x750
[  104.241167]  ? psi_group_change+0x122/0x310
[  104.241929]  ? sched_balance_newidle+0x306/0x3b0
[  104.242770]  ? psi_task_switch+0xd6/0x230
[  104.243506]  ? __switch_to_asm+0x2a/0x60
[  104.244224]  ? __schedule+0x316/0xa00
[  104.244896]  ? schedule+0x1c/0xd0
[  104.245530]  ? schedule_preempt_disabled+0xa/0x10
[  104.246386]  ? __mutex_lock.constprop.0+0x2cf/0x5a0
[  104.247274]  ? __lru_add_drain_all+0x150/0x1e0
[  104.248089]  generic_fadvise+0x230/0x280
[  104.248802]  ? __fdget+0x8c/0xe0
[  104.249407]  ksys_fadvise64_64+0x4c/0xa0
[  104.250126]  __x64_sys_fadvise64+0x18/0x20
[  104.250868]  do_syscall_64+0x5b/0x170
[  104.251543]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  104.252463] RIP: 0033:0x7fcee0e5cd6e
[  104.253131] Code: b8 ff ff ff ff eb c3 67 e8 7f cf 01 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 f3 0f 1e fa 41 89 ca b8 dd 00 00 00 0f 05 <89> c2 f7 da 3d 00 f0 ff ff b8 00 00 00 00 0f 47 c2 c3 41 57 41 56
[  104.256446] RSP: 002b:00007fcee01ffe88 EFLAGS: 00000202 ORIG_RAX: 00000000000000dd
[  104.257800] RAX: ffffffffffffffda RBX: 00007fcee0200cdc RCX: 00007fcee0e5cd6e
[  104.259085] RDX: 0000000280000000 RSI: 0000000000000000 RDI: 0000000000000003
[  104.260365] RBP: 00007fcee01ffed0 R08: 0000000000000000 R09: 00007fcee02006c0
[  104.261648] R10: 0000000000000004 R11: 0000000000000202 R12: ffffffffffffff88
[  104.262964] R13: 0000000000000000 R14: 00007ffc16078a70 R15: 00007fcedfa00000
[  104.264258]  </TASK>
[  104.264669] Modules linked in: intel_uncore_frequency_common skx_edac_common nfit libnvdimm kvm_intel bochs drm_vram_helper drm_kms_helper kvm drm_ttm_helper intel_agp ttm i2c_piix4 intel_gtt agpgart i2c_smbus evdev button serio_raw sch_fq_codel usbip_core drm loop drm_panel_orientation_quirks backlight bpf_preload virtio_rng ip_tables autofs4
[  104.270152] ---[ end trace 0000000000000000 ]---
[  104.271179] RIP: 0010:xas_load+0xe4/0x120
[  104.271968] Code: 00 10 00 00 76 c4 48 83 fa 02 75 ad 41 b8 02 04 00 00 eb a5 40 f6 c6 03 75 12 48 89 f7 e8 44 f5 ff ff 0f 0b 49 83 f8 02 75 10 <0f> 0b 48 c7 c7 76 58 98 82 e8 7e 3b 1a ff eb e8 40 f6 c6 03 75 0a
[  104.275460] RSP: 0018:ffffc90001b57b90 EFLAGS: 00010296
[  104.276481] RAX: 0000000000000000 RBX: ffffc90001b57bc8 RCX: 0000000000000000
[  104.277797] RDX: ffff88b177aee180 RSI: ffff88b177ae0b80 RDI: ffff88b177ae0b80
[  104.279101] RBP: 000000000027ffff R08: ffffffff8396b4a8 R09: 0000000000000003
[  104.280400] R10: ffffffff8326b4c0 R11: ffffffff837eb4c0 R12: ffffc90001b57d48
[  104.281705] R13: ffffc90001b57c48 R14: ffffc90001b57c50 R15: 0000000000000000
[  104.283014] FS:  00007fcee02006c0(0000) GS:ffff88b177ac0000(0000) knlGS:0000000000000000
[  104.284487] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  104.285539] CR2: 00007fcee01fff78 CR3: 000000011bdc2004 CR4: 0000000000770ef0
[  104.286838] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  104.288139] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  104.289468] PKRU: 55555554
[  104.289983] Kernel panic - not syncing: Fatal exception
[  104.292343] Kernel Offset: disabled
[  104.292990] ---[ end Kernel panic - not syncing: Fatal exception ]---
/*
 * gcc -Wall -o reader reader.c -lpthread
 */
#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/sendfile.h>
#include <unistd.h>
#include <errno.h>
#include <err.h>
#include <pthread.h>

/*
 * Arguments handed to every worker thread.  main() fills in one instance
 * and passes its address to all three pthread_create() calls.
 */
struct thread_data {
	int fd;		/* descriptor of the file under test, from open() in main() */
	size_t size;	/* file length in bytes, copied from fstat()'s st_size */
};

/*
 * Thread body: endlessly ask the kernel to drop cached pages of td->fd.
 *
 * Each iteration issues POSIX_FADV_DONTNEED on one pseudo-randomly chosen
 * 4096-byte page.  After more than nr_pages / 2 single-page drops, the
 * whole file is dropped in one call and a '+' is written to stderr as a
 * progress marker.
 *
 * arg is a struct thread_data *.  The function never returns normally;
 * any failure terminates the process through err()/errx().
 */
static void *drop_pages(void *arg)
{
	struct thread_data *td = arg;
	int ret;
	unsigned long nr_pages = td->size / 4096;
	unsigned int seed = 0x55443322;
	off_t offset;
	unsigned long nr_drops = 0;

	/* A file shorter than one page would make the modulo below divide by zero. */
	if (nr_pages == 0)
		errx(1, "file is smaller than one 4096-byte page");

	while (1) {
		offset = rand_r(&seed) % nr_pages;
		offset = offset * 4096;
		/*
		 * posix_fadvise() returns the error number directly and does
		 * not set errno, so test for non-zero and hand the code to
		 * err() through errno.
		 */
		ret = posix_fadvise(td->fd, offset, 4096, POSIX_FADV_DONTNEED);
		if (ret) {
			errno = ret;
			err(1, "fadvise dontneed");
		}

		/* every once in a while, drop everything */
		if (nr_drops > nr_pages / 2) {
			ret = posix_fadvise(td->fd, 0, td->size, POSIX_FADV_DONTNEED);
			if (ret) {
				errno = ret;
				err(1, "fadvise dontneed");
			}
			fprintf(stderr, "+");
			nr_drops = 0;
		}
		nr_drops++;
	}
	return NULL;
}

/* Size of each pread() request: 2 MiB. */
#define READ_BUF (2 * 1024 * 1024)

/*
 * Thread body: endlessly read the file front to back in READ_BUF-sized
 * pread() calls, starting over from offset 0 after each pass.
 *
 * arg is a struct thread_data *.  The function never returns normally;
 * a failed allocation or read terminates the process through err().
 */
static void *read_pages(void *arg)
{
	struct thread_data *td = arg;
	char *buf;
	ssize_t ret;
	loff_t offset;

	/*
	 * Keep the 2 MiB buffer off the thread stack: an automatic array of
	 * this size overflows wherever the thread stack is small.  The buffer
	 * lives as long as the thread, which loops forever.
	 */
	buf = malloc(READ_BUF);
	if (!buf)
		err(1, "malloc");

	while (1) {
		offset = 0;
		/* offset is never negative here, so the cast is value-preserving */
		while ((size_t)offset < td->size) {
			ret = pread(td->fd, buf, READ_BUF, offset);
			if (ret < 0)
				err(1, "read");
			/* unexpected EOF: restart the pass from the beginning */
			if (ret == 0)
				break;
			offset += ret;
		}
	}
	return NULL;
}

/*
 * Start one worker thread running fn(td), or exit with a diagnostic.
 * pthread_create() reports failure through its return value rather than
 * errno, so the code is copied into errno before calling err().
 */
static void start_thread(pthread_t *tid, void *(*fn)(void *),
			 struct thread_data *td)
{
	int ret = pthread_create(tid, NULL, fn, td);

	if (ret) {
		errno = ret;
		err(1, "pthread_create");
	}
}

/*
 * Usage: reader filename
 *
 * Opens the file read-only, records its size, and runs two drop_pages()
 * threads and one read_pages() thread against it.  The workers loop
 * forever, so the joins below only return if a thread exits unexpectedly.
 */
int main(int ac, char **av)
{
	int fd;
	int ret;
	struct stat st;
	struct thread_data td;
	pthread_t drop_tid;
	pthread_t drop2_tid;
	pthread_t read_tid;

	/* errx(), not err(): a usage error has no meaningful errno to report */
	if (ac != 2)
		errx(1, "usage: reader filename");

	/* no mode argument: it is only consulted when a file is created */
	fd = open(av[1], O_RDONLY);
	if (fd < 0)
		err(1, "unable to open %s", av[1]);

	ret = fstat(fd, &st);
	if (ret < 0)
		err(1, "stat");

	td.fd = fd;
	td.size = st.st_size;

	start_thread(&drop_tid, drop_pages, &td);
	start_thread(&drop2_tid, drop_pages, &td);
	start_thread(&read_tid, read_pages, &td);

	pthread_join(drop_tid, NULL);
	pthread_join(drop2_tid, NULL);
	pthread_join(read_tid, NULL);
	return 0;
}

[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux