Hello, we have a locking problem in gfs2 that I don't have a proper solution for, so I'm looking for suggestions. What's happening is that a page fault triggers during a read or write operation, while we're holding a glock (the cluster-wide gfs2 inode lock), and the page fault requires another glock. We can recognize and handle the case when both glocks are the same, but when the page fault requires another glock, there is a chance that taking that other glock would deadlock. When we realize that we may not be able to take the other glock in gfs2_fault, we need to communicate that to the read or write operation, which will then drop and re-acquire the "outer" glock and retry. However, there doesn't seem to be a good way to do that; we can only indicate that a page fault should fail by returning VM_FAULT_SIGBUS or similar; that will then be mapped to -EFAULT. We'd need something like VM_FAULT_RESTART that can be mapped to -EBUSY so that we can tell the retry case apart from genuine -EFAULT errors. To show what I mean, below is a proof of concept that adds a restart_hack flag to struct task_struct as a side channel. An even uglier alternative would be to abuse task->journal_info. The obvious response to this email would be "fix your locking order". Well, we can do that by pinning the user pages in gfs2_file_{read,write}_iter before taking the "outer" glock, but we'd ideally only do that when it's actually necessary (i.e., when gfs2_fault has indicated to retry). Thanks for any thoughts. Andreas --- fs/gfs2/file.c | 37 +++++++++++++++++++++++++++++++++++-- include/linux/sched.h | 1 + 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 658fed79d65a..253e720f2df0 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -543,10 +543,15 @@ static vm_fault_t gfs2_fault(struct vm_fault *vmf) vm_fault_t ret; int err; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_TRY, &gh); if (likely(!recursive)) { err = gfs2_glock_nq(&gh); if (err) { + if (err == GLR_TRYFAILED) { + current->restart_hack = 1; + ret = VM_FAULT_SIGBUS; + goto out_uninit; + } ret = block_page_mkwrite_return(err); goto out_uninit; } @@ -773,12 +778,16 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, return 0; /* skip atime */ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh); +restart: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; + current->restart_hack = 0; ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0); gfs2_glock_dq(gh); + if (unlikely(ret == -EFAULT) && current->restart_hack) + goto restart; out_uninit: gfs2_holder_uninit(gh); return ret; @@ -803,6 +812,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, * VFS does. */ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh); + +restart: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; @@ -811,11 +822,14 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, if (offset + len > i_size_read(&ip->i_inode)) goto out; + current->restart_hack = 0; ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0); if (ret == -ENOTBLK) ret = 0; out: gfs2_glock_dq(gh); + if (unlikely(ret == -EFAULT) && current->restart_hack) + goto restart; out_uninit: gfs2_holder_uninit(gh); return ret; @@ -834,7 +848,9 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; iocb->ki_flags &= ~IOCB_DIRECT; } +restart1: iocb->ki_flags |= IOCB_NOIO; + current->restart_hack = 0; ret = generic_file_read_iter(iocb, to); iocb->ki_flags &= ~IOCB_NOIO; if (ret >= 0) { @@ -842,6 +858,8 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; written = ret; } else { + if (unlikely(ret == -EFAULT) && current->restart_hack) + goto restart1; if (ret != -EAGAIN) return ret; if (iocb->ki_flags & IOCB_NOWAIT) @@ -849,13 +867,18 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } ip = GFS2_I(iocb->ki_filp->f_mapping->host); gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + +restart2: ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; + current->restart_hack = 0; ret = generic_file_read_iter(iocb, to); if (ret > 0) written += ret; gfs2_glock_dq(&gh); + if (unlikely(ret == -EFAULT) && current->restart_hack) + goto restart2; out_uninit: gfs2_holder_uninit(&gh); return written ? written : ret; @@ -912,13 +935,18 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out_unlock; iocb->ki_flags |= IOCB_DSYNC; +restart1: + current->restart_hack = 0; current->backing_dev_info = inode_to_bdi(inode); buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); current->backing_dev_info = NULL; if (unlikely(buffered <= 0)) { + if (unlikely(buffered == -EFAULT) && current->restart_hack) + goto restart1; if (!ret) ret = buffered; goto out_unlock; + } /* * We need to ensure that the page cache pages are written to @@ -935,10 +963,15 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!ret || ret2 > 0) ret += ret2; } else { +restart2: + current->restart_hack = 0; current->backing_dev_info = inode_to_bdi(inode); ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); current->backing_dev_info = NULL; - if (likely(ret > 0)) { + if (unlikely(ret <= 0)) { + if (unlikely(ret == -EFAULT) && current->restart_hack) + goto restart2; + } else { iocb->ki_pos += ret; ret = generic_write_sync(iocb, ret); } diff --git a/include/linux/sched.h b/include/linux/sched.h index d2c881384517..de053ac8d8d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1067,6 +1067,7 @@ struct task_struct { /* Journalling filesystem info: */ void *journal_info; + unsigned restart_hack:1; /* Stacked block device info: */ struct bio_list *bio_list; -- 2.26.3