On 06/14/2011 11:29 AM, Lino Sanfilippo wrote: > Hi Eric, > > After the patch series "decouple mark lock and marks fsobject lock" > I sent on Feb. 21 this is another attempt to change the locking order > used in fsnotify from > > mark->lock > group->mark_lock > inode/mnt->lock > to > group->mark_lock > mark->lock > inode/mnt->lock > > The former series still contained some flaws: > 1. inodes/mounts that have marks and are not pinned could disappear while > another thread still wants to access a mark's inode/mount (see https://lkml.org/lkml/2011/3/1/373) > 2. in the newly introduced function remove_mark_locked() a group could be freed > while the group's mark mutex is held > > Both issues should be fixed with this series. > > The reason for changing the locking order is, as you may remember, that there are > still some races when adding/removing marks to a group > (see also https://lkml.org/lkml/2010/11/30/189). > > So what this series does is change the locking order (PATCH 4) to > > group->mark_mutex > mark->lock > inode/mnt->lock > > Beside this the group mark_lock is turned into a mutex (PATCH 6), which allows us to > call functions that may sleep while this lock is held. > > At some places there is only a mark and no group available > (i.e. clear_marks_by_[inode|mount]()), so we first have to get the group from the mark > to use the group->mark_mutex (PATCH 7). > > Since we can't get a group from a mark and use it without the danger that the group is > unregistered and freed, reference counting for groups is introduced and used > (PATCHES 1 to 3) for each mark that is added to the group. > With this, freeing a group no longer depends on the number of marks, but is > simply destroyed when the reference counter hits 0. > > Since a group ref is now taken for each mark that is added to the group we first > have to explicitly call fsnotify_destroy_group() to remove all marks from the group > and release all group references held by those marks.
This will also release the > - possibly final - ref of the group held by the caller (PATCH 1). > > Furthermore we now can take the mark lock with the group mutex held so we dont need an > extra "clear list" any more if we clear all marks by group (PATCH 9). > > For [add|remove]_mark() locked versions are introduced (PATCH 8) that can be > used if a caller already has the mark lock held. Thus we now have the possibility > to use the groups mark mutex to also synchronize addition/removal of a mark or to > replace the dnotify mutex. > This is not part of these patches. It would be the next step if these patches are > accepted to be merged. > > This is against 2.6.39 I finally built and tested a v3.0 kernel with these patches (I know I'm SOOOOOO far behind). Not what I hoped for: > [ 150.937798] VFS: Busy inodes after unmount of tmpfs. Self-destruct in 5 seconds. Have a nice day... > [ 150.945290] BUG: unable to handle kernel NULL pointer dereference at 0000000000000070 > [ 150.946012] IP: [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50 > [ 150.946012] PGD 2bf9e067 PUD 2bf9f067 PMD 0 > [ 150.946012] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC > [ 150.946012] CPU 0 > [ 150.946012] Modules linked in: nfs lockd fscache auth_rpcgss nfs_acl sunrpc ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables ext4 jbd2 crc16 joydev ata_piix i2c_piix4 pcspkr uinput ipv6 autofs4 usbhid [last unloaded: scsi_wait_scan] > [ 150.946012] > [ 150.946012] Pid: 2764, comm: syscall_thrash Not tainted 3.0.0+ #1 Red Hat KVM > [ 150.946012] RIP: 0010:[<ffffffff810ffd58>] [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50 > [ 150.946012] RSP: 0018:ffff88002c2e5df8 EFLAGS: 00010282 > [ 150.946012] RAX: 000000004e370d9f RBX: 0000000000000000 RCX: ffff88003a029438 > [ 150.946012] RDX: 0000000033630a5f RSI: 0000000000000000 RDI: ffff88003491c240 > [ 150.946012] RBP: ffff88002c2e5e08 R08: 0000000000000000 R09: 0000000000000000 > [ 150.946012] R10: 0000000000000000 R11: 0000000000000000 
R12: ffff88003a029428 > [ 150.946012] R13: ffff88003a029428 R14: ffff88003a029428 R15: ffff88003499a610 > [ 150.946012] FS: 00007f5a05420700(0000) GS:ffff88003f600000(0000) knlGS:0000000000000000 > [ 150.946012] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > [ 150.946012] CR2: 0000000000000070 CR3: 000000002a662000 CR4: 00000000000006f0 > [ 150.946012] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > [ 150.946012] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > [ 150.946012] Process syscall_thrash (pid: 2764, threadinfo ffff88002c2e4000, task ffff88002bfbc760) > [ 150.946012] Stack: > [ 150.946012] ffff88003a029438 ffff88003a029428 ffff88002c2e5e38 ffffffff81102f76 > [ 150.946012] ffff88003a029438 ffff88003a029598 ffffffff8160f9c0 ffff88002c221250 > [ 150.946012] ffff88002c2e5e68 ffffffff8115e9be ffff88002c2e5e68 ffff88003a029438 > [ 150.946012] Call Trace: > [ 150.946012] [<ffffffff81102f76>] shmem_evict_inode+0x76/0x130 > [ 150.946012] [<ffffffff8115e9be>] evict+0x7e/0x170 > [ 150.946012] [<ffffffff8115ee40>] iput_final+0xd0/0x190 > [ 150.946012] [<ffffffff8115ef33>] iput+0x33/0x40 > [ 150.946012] [<ffffffff81180205>] fsnotify_destroy_mark_locked+0x145/0x160 > [ 150.946012] [<ffffffff81180316>] fsnotify_destroy_mark+0x36/0x50 > [ 150.946012] [<ffffffff81181937>] sys_inotify_rm_watch+0x77/0xd0 > [ 150.946012] [<ffffffff815aca52>] system_call_fastpath+0x16/0x1b > [ 150.946012] Code: 67 4a 00 b8 e4 ff ff ff eb aa 66 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec 10 48 89 1c 24 4c 89 64 24 08 48 8b 9f 40 05 00 00 > [ 150.946012] 83 7b 70 00 74 1c 4c 8d a3 80 00 00 00 4c 89 e7 e8 d2 5d 4a > [ 150.946012] RIP [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50 > [ 150.946012] RSP <ffff88002c2e5df8> > [ 150.946012] CR2: 0000000000000070 Looks at aweful lot like the problem from: http://www.spinics.net/lists/linux-fsdevel/msg46101.html Latest stress test program attached..... -Eric
/*
 * inotify_syscall_thrash - stress tester for the fsnotify/inotify mark
 * add/remove paths.
 *
 * Many threads hammer the same small set of files concurrently:
 *   - watcher threads add watches as fast as possible,
 *   - closer threads walk [low_wd, high_wd) removing watches,
 *   - "zero closer" threads re-remove the lowest few wd's for extra races,
 *   - data dumper threads drain the (non-blocking) inotify fds,
 *   - creator threads unlink and recreate the watched files,
 *   - a mounter thread repeatedly mounts a tmpfs over the watched
 *     directory and lazily unmounts it (MNT_DETACH).
 *
 * Runs until interrupted with SIGINT (ctrl-c), then cleans up after
 * itself.  The mount/umount part requires root.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/inotify.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#define NUM_CORES		2
#define NUM_DATA_DUMPERS	4
#define WATCHER_MULTIPLIER	4
#define NUM_WATCHER_THREADS	NUM_CORES
/* parenthesized so the expansion is safe in any arithmetic context */
#define NUM_CLOSER_THREADS	(NUM_WATCHER_THREADS * 2)
#define NUM_ZERO_CLOSERS	1
#define NUM_FILE_CREATERS	2
#define NUM_INOTIFY_INSTANCES	2

#define TMP_DIR_NAME "/tmp/inotify_syscall_thrash"

/* argument for the watch-adding threads: which inotify fd, which file */
struct watcher_struct {
	int inotify_fd;
	int file_num;
};

/* argument for every other per-instance thread: just the inotify fd */
struct operator_struct {
	int inotify_fd;
};

/* set from the SIGINT handler; volatile sig_atomic_t per C signal rules */
static volatile sig_atomic_t stopped = 0;

/*
 * Highest/lowest watch descriptor seen so far.  Updated racily by many
 * threads on purpose -- precision does not matter for a thrasher, the
 * closer threads only need a rough range to walk.
 */
static int high_wd = 0;
static int low_wd = INT_MAX;

void *add_watches(void *ptr);
void *close_watches(void *ptr);
void *zero_close_watches(void *ptr);
void *dump_data(void *ptr);
void *reset_low_wd(void *ptr);
void *create_files(void *ptr);
void *mount_tmpdir(void *ptr);
void sigfunc(int sig_num);

/* print a perror() message for @arg and abort the whole test; never returns */
int handle_error(const char *arg)
{
	perror(arg);
	exit(1);
}

int main(void)
{
	int inotify_fd[NUM_INOTIFY_INSTANCES];
	struct watcher_struct *watcher_arg;
	struct operator_struct *operator_arg;
	pthread_t *watchers;
	pthread_t *closers;
	pthread_t *zero_closers;
	pthread_t *data_dumpers;
	pthread_t *file_creaters;
	pthread_t low_wd_reseter;
	pthread_t mounter;
	pthread_attr_t attr;
	int iret;
	void *ret;
	int i, j, k;
	struct sigaction setmask;

	/* close cleanly on cntl+c */
	sigemptyset(&setmask.sa_mask);
	setmask.sa_handler = sigfunc;
	setmask.sa_flags = 0;
	sigaction(SIGINT, &setmask, (struct sigaction *) NULL);

	/*
	 * Create the inotify instances and make them O_NONBLOCK so the
	 * data dumper threads spin instead of blocking forever.
	 */
	for (i = 0; i < NUM_INOTIFY_INSTANCES; i++) {
		int fd = inotify_init();

		if (fd < 0)
			handle_error("opening inotify_fd");
		iret = fcntl(fd, F_SETFL, O_NONBLOCK);
		if (iret)
			handle_error("setting NONBLOCK");
		inotify_fd[i] = fd;
	}

	/* make sure the directory exists (EEXIST is fine, so no check) */
	mkdir(TMP_DIR_NAME, S_IRWXU);

	/* set up a pthread attr with a tiny stack; we start a lot of threads */
	iret = pthread_attr_init(&attr);
	if (iret)
		handle_error("pthread_attr_init");
	iret = pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN * 2);
	if (iret)
		handle_error("pthread_attr_setstacksize");

	/* watchers need to know which file to play with, so build their args */
	watcher_arg = calloc(NUM_INOTIFY_INSTANCES * NUM_WATCHER_THREADS,
			     sizeof(struct watcher_struct));
	if (!watcher_arg)
		handle_error("allocating watcher_arg");
	operator_arg = calloc(NUM_INOTIFY_INSTANCES,
			      sizeof(struct operator_struct));
	if (!operator_arg)
		handle_error("allocating operator_arg");

	/* allocate the pthread_t's for all of the threads */
	watchers = calloc(NUM_INOTIFY_INSTANCES * NUM_WATCHER_THREADS *
			  WATCHER_MULTIPLIER, sizeof(pthread_t));
	if (!watchers)
		handle_error("allocating watchers");
	closers = calloc(NUM_INOTIFY_INSTANCES * NUM_CLOSER_THREADS,
			 sizeof(pthread_t));
	if (!closers)
		handle_error("allocating closers");
	zero_closers = calloc(NUM_INOTIFY_INSTANCES * NUM_ZERO_CLOSERS,
			      sizeof(pthread_t));
	if (!zero_closers)
		handle_error("allocating zero_closers");
	data_dumpers = calloc(NUM_INOTIFY_INSTANCES * NUM_DATA_DUMPERS,
			      sizeof(pthread_t));
	if (!data_dumpers)
		handle_error("allocating data_dumpers");
	file_creaters = calloc(NUM_FILE_CREATERS, sizeof(*file_creaters));
	if (!file_creaters)
		handle_error("allocating file_creaters");

	/* create a thread that does nothing but reset the low_wd */
	iret = pthread_create(&low_wd_reseter, &attr, reset_low_wd, NULL);
	if (iret)
		handle_error("low_wd_reseter");
	/* and one that mounts/detaches a tmpfs over the watched directory */
	iret = pthread_create(&mounter, &attr, mount_tmpdir, NULL);
	if (iret)
		handle_error("mounter");

	/*
	 * Create WATCHER_MULTIPLIER threads per file which do nothing
	 * but try to add a watch for each INOTIFY_INSTANCE.
	 */
	for (i = 0; i < NUM_INOTIFY_INSTANCES; i++) {
		for (j = 0; j < NUM_WATCHER_THREADS; j++) {
			watcher_arg[i * NUM_WATCHER_THREADS + j].file_num = j;
			watcher_arg[i * NUM_WATCHER_THREADS + j].inotify_fd = inotify_fd[i];
			for (k = 0; k < WATCHER_MULTIPLIER; k++) {
				iret = pthread_create(&watchers[i * (NUM_WATCHER_THREADS * WATCHER_MULTIPLIER) +
								(j * WATCHER_MULTIPLIER) + k],
						      &attr, add_watches,
						      &watcher_arg[i * NUM_WATCHER_THREADS + j]);
				if (iret)
					handle_error("creating watcher threads");
			}
		}
	}

	for (i = 0; i < NUM_INOTIFY_INSTANCES; i++)
		operator_arg[i].inotify_fd = inotify_fd[i];

	/* create threads which unlink and then recreate all of the files in question */
	for (i = 0; i < NUM_FILE_CREATERS; i++) {
		iret = pthread_create(&file_creaters[i], &attr, create_files, NULL);
		if (iret)
			handle_error("creating the file creators");
	}

	/* create threads which walk from low_wd to high_wd closing all of the wd's in between */
	for (i = 0; i < NUM_INOTIFY_INSTANCES; i++)
		for (j = 0; j < NUM_CLOSER_THREADS; j++) {
			iret = pthread_create(&closers[i * NUM_CLOSER_THREADS + j],
					      &attr, close_watches, &operator_arg[i]);
			if (iret)
				handle_error("creating the close threads");
		}

	/* create threads which just walk from low_wd to low_wd+3 closing wd's for extra races */
	for (i = 0; i < NUM_INOTIFY_INSTANCES; i++)
		for (j = 0; j < NUM_ZERO_CLOSERS; j++) {
			iret = pthread_create(&zero_closers[i * NUM_ZERO_CLOSERS + j],
					      &attr, zero_close_watches, &operator_arg[i]);
			if (iret)
				handle_error("creating the low closer threads");
		}

	/*
	 * Create threads which just pull data off of the inotify fd.
	 * Use the default attr for a larger stack.
	 */
	for (i = 0; i < NUM_INOTIFY_INSTANCES; i++)
		for (j = 0; j < NUM_DATA_DUMPERS; j++) {
			iret = pthread_create(&data_dumpers[i * NUM_DATA_DUMPERS + j],
					      NULL, dump_data, &operator_arg[i]);
			if (iret)
				handle_error("creating threads to dump inotify data");
		}

	/* wait till threads are complete before main continues */
	pthread_join(low_wd_reseter, &ret);

	for (i = 0; i < NUM_INOTIFY_INSTANCES; i++)
		for (j = 0; j < NUM_WATCHER_THREADS; j++)
			for (k = 0; k < WATCHER_MULTIPLIER; k++)
				pthread_join(watchers[i * (NUM_WATCHER_THREADS * WATCHER_MULTIPLIER) +
						      (j * WATCHER_MULTIPLIER) + k], &ret);

	for (i = 0; i < NUM_FILE_CREATERS; i++)
		pthread_join(file_creaters[i], &ret);

	for (i = 0; i < NUM_INOTIFY_INSTANCES; i++)
		for (j = 0; j < NUM_CLOSER_THREADS; j++)
			pthread_join(closers[i * NUM_CLOSER_THREADS + j], &ret);

	for (i = 0; i < NUM_INOTIFY_INSTANCES; i++)
		for (j = 0; j < NUM_ZERO_CLOSERS; j++)
			pthread_join(zero_closers[i * NUM_ZERO_CLOSERS + j], &ret);

	for (i = 0; i < NUM_INOTIFY_INSTANCES; i++)
		for (j = 0; j < NUM_DATA_DUMPERS; j++)
			pthread_join(data_dumpers[i * NUM_DATA_DUMPERS + j], &ret);

	/* join the mounter too, so the tmpfs is detached before the rmdir */
	pthread_join(mounter, &ret);

	/* clean up the tmp dir which should be empty */
	rmdir(TMP_DIR_NAME);

	exit(0);
}

/* SIGINT handler: flag every worker loop to wind down */
void sigfunc(int sig_num)
{
	if (sig_num == SIGINT)
		stopped = 1;
	else
		/* NOTE(review): printf is not async-signal-safe; this branch
		 * only fires on an unexpected signal and is kept as a debug aid */
		printf("Got an unknown signal!\n");
}

/* constantly create and delete all of the files that are being watched */
void *create_files(__attribute__ ((unused)) void *ptr)
{
	char filename[50];
	int i;

	fprintf(stderr, "Starting creator thread\n");

	while (!stopped) {
		for (i = 0; i < NUM_WATCHER_THREADS; i++) {
			int fd;

			snprintf(filename, sizeof(filename), "%s/%d", TMP_DIR_NAME, i);
			unlink(filename);
			fd = open(filename, O_RDWR|O_CREAT, S_IRUSR|S_IWUSR);
			if (fd >= 0)
				close(fd);
		}
		sleep(10);
	}

	/* cleanup all files on exit */
	for (i = 0; i < NUM_WATCHER_THREADS; i++) {
		snprintf(filename, sizeof(filename), "%s/%d", TMP_DIR_NAME, i);
		unlink(filename);
	}

	return NULL;
}

/* reset the low_wd once a second so closers can be smart */
void *reset_low_wd(__attribute__ ((unused)) void *ptr)
{
	fprintf(stderr, "Starting low_wd reset thread\n");

	while (!stopped) {
		low_wd = INT_MAX;
		sleep(1);
	}

	return NULL;
}

/* pull events off the buffer and ignore them */
void *dump_data(void *ptr)
{
	char buf[8096];
	struct operator_struct *operator_arg = ptr;
	int inotify_fd = operator_arg->inotify_fd;
	int ret;

	fprintf(stderr, "Starting inotify data dumper thread\n");

	while (!stopped) {
		ret = read(inotify_fd, buf, sizeof(buf));
		if (ret <= 0)
			sched_yield();
	}

	return NULL;
}

/* add a watch to a specific file as fast as we can */
void *add_watches(void *ptr)
{
	struct watcher_struct *watcher_arg = ptr;
	int file_num = watcher_arg->file_num;
	int notify_fd = watcher_arg->inotify_fd;
	int ret;
	char filename[50];

	fprintf(stderr, "Creating an watch adder thread, notify_fd=%d filenum=%d\n",
		notify_fd, file_num);

	snprintf(filename, sizeof(filename), "%s/%d", TMP_DIR_NAME, file_num);

	while (!stopped) {
		ret = inotify_add_watch(notify_fd, filename, IN_ALL_EVENTS);
		/* ENOENT just means a creator thread has the file unlinked */
		if (ret < 0 && errno != ENOENT)
			perror("inotify_add_watch");
		/* racy bookkeeping; close enough for the closer threads */
		if (ret > high_wd)
			high_wd = ret;
		if (ret < low_wd)
			low_wd = ret;
		sched_yield();
	}

	return NULL;
}

/* run from low_wd to high_wd closing all watches in between */
void *close_watches(void *ptr)
{
	struct operator_struct *operator_arg = ptr;
	int inotify_fd = operator_arg->inotify_fd;
	int i;

	fprintf(stderr, "Starting a thread to close watchers\n");

	while (!stopped) {
		for (i = low_wd; i < high_wd; i++)
			inotify_rm_watch(inotify_fd, i);
		sched_yield();
	}

	return NULL;
}

/* run from low_wd to low_wd+3 closing all watches in between just for extra races */
void *zero_close_watches(void *ptr)
{
	struct operator_struct *operator_arg = ptr;
	int inotify_fd = operator_arg->inotify_fd;
	int i;

	while (!stopped) {
		for (i = low_wd; i <= low_wd + 3; i++)
			inotify_rm_watch(inotify_fd, i);
		sched_yield();
	}

	return NULL;
}

/*
 * Repeatedly mount a fresh tmpfs on top of the test directory and then
 * lazily unmount it (MNT_DETACH), so inodes carrying marks disappear out
 * from under the other threads.  Mount failure (e.g. not root) is
 * tolerated; we just skip the umount.
 */
void *mount_tmpdir(__attribute__ ((unused)) void *ptr)
{
	int rc;

	while (!stopped) {
		rc = mount(TMP_DIR_NAME, TMP_DIR_NAME, "tmpfs", MS_MGC_VAL,
			   "rootcontext=\"unconfined_u:object_r:tmp_t:s0\"");
		usleep(100000);
		if (!rc)
			umount2(TMP_DIR_NAME, MNT_DETACH);
		usleep(100000);
	}

	return NULL;
}