On Fri, Aug 15, 2014 at 04:39:06PM +1000, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > There is a lot of cookie-cutter code that looks like: > > if (shutdown) > handle buffer error > xfs_buf_iorequest(bp) > error = xfs_buf_iowait(bp) > if (error) > handle buffer error > > spread through XFS. There's significant complexity now in > xfs_buf_iorequest() to specifically handle this sort of synchronous > IO pattern, but there's all sorts of nasty surprises in different > error handling code dependent on who owns the buffer references and > the locks. > > Pull this pattern into a single helper, where we can hide all the > synchronous IO warts and hence make the error handling for all the > callers much saner. This removes the need for a special extra > reference to protect IO completion processing, as we can now hold a > single reference across dispatch and waiting, simplifying the sync > IO semantics and error handling. > > In doing this, also rename xfs_buf_iorequest to xfs_buf_submit and > make it explicitly handle only asynchronous IO. This forces all users > to be switched specifically to one interface or the other and > removes any ambiguity between how the interfaces are to be used. It > also means that xfs_buf_iowait() goes away. > > For the special case of delwri buffer submission and waiting, we > don't need to issue IO synchronously at all. The second pass to call > xfs_buf_iowait() can now just block on xfs_buf_lock() - the buffer > will be unlocked when the async IO is complete. This formalises a > sane method of waiting for async IO - take an extra reference, > submit the IO, call xfs_buf_lock() when you want to wait for IO > completion. i.e.: > > bp = xfs_buf_find(); > xfs_buf_hold(bp); > bp->b_flags |= XBF_ASYNC; > xfs_buf_iosubmit(bp); > xfs_buf_lock(bp) > error = bp->b_error; > .... 
> xfs_buf_relse(bp); > > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- > fs/xfs/xfs_bmap_util.c | 14 +--- > fs/xfs/xfs_buf.c | 186 ++++++++++++++++++++++++++--------------------- > fs/xfs/xfs_buf.h | 4 +- > fs/xfs/xfs_buf_item.c | 4 +- > fs/xfs/xfs_log.c | 2 +- > fs/xfs/xfs_log_recover.c | 22 ++---- > fs/xfs/xfs_trans_buf.c | 17 ++--- > 7 files changed, 122 insertions(+), 127 deletions(-) > > diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c > index 2f1e30d..c53cc03 100644 > --- a/fs/xfs/xfs_bmap_util.c > +++ b/fs/xfs/xfs_bmap_util.c > @@ -1157,12 +1157,7 @@ xfs_zero_remaining_bytes( > XFS_BUF_READ(bp); > XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); > > - if (XFS_FORCED_SHUTDOWN(mp)) { > - error = -EIO; > - break; > - } > - xfs_buf_iorequest(bp); > - error = xfs_buf_iowait(bp); > + error = xfs_buf_submit_wait(bp); > if (error) { > xfs_buf_ioerror_alert(bp, > "xfs_zero_remaining_bytes(read)"); > @@ -1175,12 +1170,7 @@ xfs_zero_remaining_bytes( > XFS_BUF_UNREAD(bp); > XFS_BUF_WRITE(bp); > > - if (XFS_FORCED_SHUTDOWN(mp)) { > - error = -EIO; > - break; > - } > - xfs_buf_iorequest(bp); > - error = xfs_buf_iowait(bp); > + error = xfs_buf_submit_wait(bp); > if (error) { > xfs_buf_ioerror_alert(bp, > "xfs_zero_remaining_bytes(write)"); > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c > index 352e9219..a2599f9 100644 > --- a/fs/xfs/xfs_buf.c > +++ b/fs/xfs/xfs_buf.c > @@ -623,10 +623,11 @@ _xfs_buf_read( > bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); > bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); > > - xfs_buf_iorequest(bp); > - if (flags & XBF_ASYNC) > + if (flags & XBF_ASYNC) { > + xfs_buf_submit(bp); > return 0; > - return xfs_buf_iowait(bp); > + } > + return xfs_buf_submit_wait(bp); > } > > xfs_buf_t * > @@ -708,12 +709,7 @@ xfs_buf_read_uncached( > bp->b_flags |= XBF_READ; > bp->b_ops = ops; > > - if (XFS_FORCED_SHUTDOWN(target->bt_mount)) { > - xfs_buf_relse(bp); > - return NULL; > - } > - 
xfs_buf_iorequest(bp); > - xfs_buf_iowait(bp); > + xfs_buf_submit_wait(bp); > return bp; > } > > @@ -1027,10 +1023,8 @@ xfs_buf_ioend( > (*(bp->b_iodone))(bp); > else if (bp->b_flags & XBF_ASYNC) > xfs_buf_relse(bp); > - else { > + else > complete(&bp->b_iowait); > - xfs_buf_rele(bp); > - } > } > > static void > @@ -1083,21 +1077,7 @@ xfs_bwrite( > bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | > XBF_WRITE_FAIL | XBF_DONE); > > - if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { > - trace_xfs_bdstrat_shut(bp, _RET_IP_); > - > - xfs_buf_ioerror(bp, -EIO); > - xfs_buf_stale(bp); > - > - /* sync IO, xfs_buf_ioend is going to remove a ref here */ > - xfs_buf_hold(bp); > - xfs_buf_ioend(bp); > - return -EIO; > - } > - > - xfs_buf_iorequest(bp); > - > - error = xfs_buf_iowait(bp); > + error = xfs_buf_submit_wait(bp); > if (error) { > xfs_force_shutdown(bp->b_target->bt_mount, > SHUTDOWN_META_IO_ERROR); > @@ -1206,7 +1186,7 @@ next_chunk: > } else { > /* > * This is guaranteed not to be the last io reference count > - * because the caller (xfs_buf_iorequest) holds a count itself. > + * because the caller (xfs_buf_submit) holds a count itself. > */ > atomic_dec(&bp->b_io_remaining); > xfs_buf_ioerror(bp, -EIO); > @@ -1296,13 +1276,29 @@ _xfs_buf_ioapply( > blk_finish_plug(&plug); > } > > +/* > + * Asynchronous IO submission path. This transfers the buffer lock ownership and > + * the current reference to the IO. It is not safe to reference the buffer after > + * a call to this function unless the caller holds an additional reference > + * itself. 
> + */ > void > -xfs_buf_iorequest( > - xfs_buf_t *bp) > +xfs_buf_submit( > + struct xfs_buf *bp) > { > trace_xfs_buf_iorequest(bp, _RET_IP_); > > ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); > + ASSERT(bp->b_flags & XBF_ASYNC); > + > + /* on shutdown we stale and complete the buffer immediately */ > + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { > + xfs_buf_ioerror(bp, -EIO); > + bp->b_flags &= ~XBF_DONE; > + xfs_buf_stale(bp); > + xfs_buf_ioend(bp); > + return; > + } > > if (bp->b_flags & XBF_WRITE) > xfs_buf_wait_unpin(bp); > @@ -1311,25 +1307,10 @@ xfs_buf_iorequest( > bp->b_io_error = 0; > I know this is from the previous patch, but I wonder if it's cleaner to reset b_io_error when the error is transferred to b_error. That seems slightly more future proof at least. > /* > - * Take references to the buffer. For XBF_ASYNC buffers, holding a > - * reference for as long as submission takes is all that is necessary > - * here. The IO inherits the lock and hold count from the submitter, > - * and these are release during IO completion processing. Taking a hold > - * over submission ensures that the buffer is not freed until we have > - * completed all processing, regardless of when IO errors occur or are > - * reported. > - * > - * However, for synchronous IO, the IO does not inherit the submitters > - * reference count, nor the buffer lock. Hence we need to take an extra > - * reference to the buffer for the for the IO context so that we can > - * guarantee the buffer is not freed until all IO completion processing > - * is done. Otherwise the caller can drop their reference while the IO > - * is still in progress and hence trigger a use-after-free situation. > + * Take an extra reference so that we don't have to guess when it's no > + * longer safe to reference the buffer that was passed to us. > */ Assuming my understanding is correct: /* * The caller's reference is released by buffer I/O completion. 
Technically * this should not occur until we release the last b_io_remaining reference. * Take a direct reference across the I/O submission anyways to be sure it's * safe to access the buffer until we release it. */ > xfs_buf_hold(bp); > - if (!(bp->b_flags & XBF_ASYNC)) > - xfs_buf_hold(bp); > - > > /* > * Set the count to 1 initially, this will stop an I/O completion > @@ -1340,14 +1321,13 @@ xfs_buf_iorequest( > _xfs_buf_ioapply(bp); > > /* > - * If _xfs_buf_ioapply failed or we are doing synchronous IO that > - * completes extremely quickly, we can get back here with only the IO > + * If _xfs_buf_ioapply failed, > + * we can get back here with only the IO > * reference we took above. _xfs_buf_ioend will drop it to zero, so > * we'd better run completion processing synchronously so that the we > - * don't return to the caller with completion still pending. In the > - * error case, this allows the caller to check b_error safely without > - * waiting, and in the synchronous IO case it avoids unnecessary context > - * switches an latency for high-peformance devices. > + * don't return to the caller with completion still pending. > + * this allows the caller to check b_error safely without > + * waiting > */ > if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { > if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) > @@ -1357,25 +1337,70 @@ xfs_buf_iorequest( > } > > xfs_buf_rele(bp); > + /* Note: it is not safe to reference bp now we've dropped our ref */ > } > > /* > - * Waits for I/O to complete on the buffer supplied. It returns immediately if > - * no I/O is pending or there is already a pending error on the buffer, in which > - * case nothing will ever complete. It returns the I/O error code, if any, or > - * 0 if there was no error. > + * Synchronous buffer IO submission path, read or write. 
> */ > int > -xfs_buf_iowait( > - xfs_buf_t *bp) > +xfs_buf_submit_wait( > + struct xfs_buf *bp) > { > - trace_xfs_buf_iowait(bp, _RET_IP_); > + int error; > + > + trace_xfs_buf_iorequest(bp, _RET_IP_); > > - if (!bp->b_error) > - wait_for_completion(&bp->b_iowait); > + ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC))); > + > + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { > + xfs_buf_ioerror(bp, -EIO); > + xfs_buf_stale(bp); > + bp->b_flags &= ~XBF_DONE; > + return -EIO; > + } > > + if (bp->b_flags & XBF_WRITE) > + xfs_buf_wait_unpin(bp); > + > + /* clear the internal error state to avoid spurious errors */ > + bp->b_io_error = 0; > + > + /* > + * For synchronous IO, the IO does not inherit the submitters reference > + * count, nor the buffer lock. Hence we cannot release the reference we > + * are about to take until we've waited for all IO completion to occur, > + * including any xfs_buf_ioend_async() work that may be pending. > + */ > + xfs_buf_hold(bp); > + Harmless, but why is this necessary? The caller should have the reference, the I/O completion won't release it and we wait on b_iowait before we return. Isn't the caller's reference sufficient? > + /* > + * Set the count to 1 initially, this will stop an I/O completion > + * callout which happens before we have started all the I/O from calling > + * xfs_buf_ioend too early. > + */ > + atomic_set(&bp->b_io_remaining, 1); > + _xfs_buf_ioapply(bp); > + > + /* > + * make sure we run completion synchronously if it raced with us and is > + * already complete. > + */ > + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) > + xfs_buf_ioend(bp); > + > + /* wait for completion before gathering the error from the buffer */ > + trace_xfs_buf_iowait(bp, _RET_IP_); > + wait_for_completion(&bp->b_iowait); > trace_xfs_buf_iowait_done(bp, _RET_IP_); > - return bp->b_error; > + error = bp->b_error; > + > + /* > + * all done now, we can release the hold that keeps the buffer > + * referenced for the entire IO. 
> + */ > + xfs_buf_rele(bp); > + return error; > } > > xfs_caddr_t > @@ -1769,29 +1794,19 @@ __xfs_buf_delwri_submit( > blk_start_plug(&plug); > list_for_each_entry_safe(bp, n, io_list, b_list) { > bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); > - bp->b_flags |= XBF_WRITE; > + bp->b_flags |= XBF_WRITE | XBF_ASYNC; > > - if (!wait) { > - bp->b_flags |= XBF_ASYNC; > + /* > + * we do all Io submission async. This means if we need to wait > + * for IO completion we need to take an extra reference so the > + * buffer is still valid on the other side. > + */ > + if (!wait) > list_del_init(&bp->b_list); > - } > - > - if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { > - trace_xfs_bdstrat_shut(bp, _RET_IP_); > - > - xfs_buf_ioerror(bp, -EIO); > - xfs_buf_stale(bp); > + else > + xfs_buf_hold(bp); > > - /* > - * if sync IO, xfs_buf_ioend is going to remove a > - * ref here. We need to add that so we can collect > - * all the buffer errors in the wait loop. > - */ > - if (wait) > - xfs_buf_hold(bp); > - xfs_buf_ioend(bp); > - } else > - xfs_buf_iorequest(bp); > + xfs_buf_submit(bp); > } > blk_finish_plug(&plug); > > @@ -1838,7 +1853,10 @@ xfs_buf_delwri_submit( > bp = list_first_entry(&io_list, struct xfs_buf, b_list); > > list_del_init(&bp->b_list); > - error2 = xfs_buf_iowait(bp); > + > + /* locking the buffer will wait for async IO completion. */ > + xfs_buf_lock(bp); > + error2 = bp->b_error; Interesting delwri cleanup overall. The lock here works for synchronization (blocking), but it doesn't look safe for error processing. Once the buffer lock is dropped, who says b_error is from our write (and not a subsequent, for example) and thus this caller's responsibility to handle the error? I suspect this is a reason the lock is typically held across a sync I/O, so the error is valid after the I/O. 
Also, it looks like blocking on async I/O as such opens up the possibility of blocking indefinitely on failing writes if the right combination of delwri and b_iodone handler is used (see xfs_buf_iodone_callbacks()). I'm not sure if that's a real problem today, though. > xfs_buf_relse(bp); > if (!error) > error = error2; > diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h > index d8f57f6..0acfc30 100644 > --- a/fs/xfs/xfs_buf.h > +++ b/fs/xfs/xfs_buf.h > @@ -290,8 +290,8 @@ extern int xfs_bwrite(struct xfs_buf *bp); > extern void xfs_buf_ioend(struct xfs_buf *bp); > extern void xfs_buf_ioerror(xfs_buf_t *, int); > extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); > -extern void xfs_buf_iorequest(xfs_buf_t *); > -extern int xfs_buf_iowait(xfs_buf_t *); > +extern void xfs_buf_submit(struct xfs_buf *bp); > +extern int xfs_buf_submit_wait(struct xfs_buf *bp); > extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, > xfs_buf_rw_t); > #define xfs_buf_zero(bp, off, len) \ > diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c > index 4fd41b5..cbea909 100644 > --- a/fs/xfs/xfs_buf_item.c > +++ b/fs/xfs/xfs_buf_item.c > @@ -1081,7 +1081,7 @@ xfs_buf_iodone_callbacks( > * a way to shut the filesystem down if the writes keep failing. > * > * In practice we'll shut the filesystem down soon as non-transient > - * erorrs tend to affect the whole device and a failing log write > + * errors tend to affect the whole device and a failing log write > * will make us give up. But we really ought to do better here. 
> */ > if (XFS_BUF_ISASYNC(bp)) { > @@ -1094,7 +1094,7 @@ xfs_buf_iodone_callbacks( > if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { > bp->b_flags |= XBF_WRITE | XBF_ASYNC | > XBF_DONE | XBF_WRITE_FAIL; > - xfs_buf_iorequest(bp); > + xfs_buf_submit(bp); > } else { > xfs_buf_relse(bp); > } > diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c > index e4665db..c4d7f79 100644 > --- a/fs/xfs/xfs_log.c > +++ b/fs/xfs/xfs_log.c > @@ -1699,7 +1699,7 @@ xlog_bdstrat( > return 0; > } > > - xfs_buf_iorequest(bp); > + xfs_buf_submit(bp); > return 0; > } > > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c > index 4ba19bf..1e14452 100644 > --- a/fs/xfs/xfs_log_recover.c > +++ b/fs/xfs/xfs_log_recover.c > @@ -193,12 +193,8 @@ xlog_bread_noalign( > bp->b_io_length = nbblks; > bp->b_error = 0; > > - if (XFS_FORCED_SHUTDOWN(log->l_mp)) > - return -EIO; > - > - xfs_buf_iorequest(bp); > - error = xfs_buf_iowait(bp); > - if (error) > + error = xfs_buf_submit_wait(bp); > + if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) > xfs_buf_ioerror_alert(bp, __func__); > return error; > } > @@ -4427,16 +4423,12 @@ xlog_do_recover( > XFS_BUF_UNASYNC(bp); > bp->b_ops = &xfs_sb_buf_ops; > > - if (XFS_FORCED_SHUTDOWN(log->l_mp)) { > - xfs_buf_relse(bp); > - return -EIO; > - } > - > - xfs_buf_iorequest(bp); > - error = xfs_buf_iowait(bp); > + error = xfs_buf_submit_wait(bp); > if (error) { > - xfs_buf_ioerror_alert(bp, __func__); > - ASSERT(0); > + if (XFS_FORCED_SHUTDOWN(log->l_mp)) { Should this be !XFS_FORCED_SHUTDOWN()? 
Brian > + xfs_buf_ioerror_alert(bp, __func__); > + ASSERT(0); > + } > xfs_buf_relse(bp); > return error; > } > diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c > index 9c3e610..4bdf02c 100644 > --- a/fs/xfs/xfs_trans_buf.c > +++ b/fs/xfs/xfs_trans_buf.c > @@ -286,18 +286,13 @@ xfs_trans_read_buf_map( > bp->b_flags |= XBF_READ; > bp->b_ops = ops; > > - if (XFS_FORCED_SHUTDOWN(mp)) { > - trace_xfs_bdstrat_shut(bp, _RET_IP_); > - error = -EIO; > - goto out_stale; > - } > - > - xfs_buf_iorequest(bp); > - error = xfs_buf_iowait(bp); > + error = xfs_buf_submit_wait(bp); > if (error) { > - xfs_buf_ioerror_alert(bp, __func__); > - goto out_do_shutdown; > - > + if (!XFS_FORCED_SHUTDOWN(mp)) { > + xfs_buf_ioerror_alert(bp, __func__); > + goto out_do_shutdown; > + } > + goto out_stale; > } > } > > -- > 2.0.0 > > _______________________________________________ > xfs mailing list > xfs@xxxxxxxxxxx > http://oss.sgi.com/mailman/listinfo/xfs _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs