On Wed, Jul 14, 2021 at 01:19:54PM +1000, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > The running of a forced shutdown is a bit of a mess. It does racy > checks for XFS_MOUNT_SHUTDOWN in xfs_do_force_shutdown(), then > does more racy checks in xfs_log_force_umount() before finally > setting XFS_MOUNT_SHUTDOWN and XLOG_IO_ERROR under the > log->icloglock. > > Move the checking and setting of XFS_MOUNT_SHUTDOWN into > xfs_do_force_shutdown() so we only process a shutdown once and once > only. Serialise this with the mp->m_sb_lock spinlock so that the > state change is atomic and won't race. Move all the mount specific > shutdown state changes from xfs_log_force_umount() to > xfs_do_force_shutdown() so they are done atomically with setting > XFS_MOUNT_SHUTDOWN. > > Then get rid of the racy xlog_is_shutdown() check from > xlog_force_shutdown(), and gate the log shutdown on the > test_and_set_bit(XLOG_IO_ERROR) test under the icloglock. This > means that the log is shutdown once and once only, and code that > needs to prevent races with shutdown can do so by holding the > icloglock and checking the return value of xlog_is_shutdown(). > > This results in a predicable shutdown execution process - we set the s/predicable/predictable/, same as last time... ...but with that fixed, Reviewed-by: Darrick J. Wong <djwong@xxxxxxxxxx> --D > shutdown flags once and process the shutdown once rather than the > current "as many concurrent shutdowns as can race to the flag > setting" situation we have now. > > Also, now that shutdown is atomic, always emit a stack trace when the > error level for the filesystem is high enough. This means that we > always get a stack trace when trying to diagnose the cause of > shutdowns in the field, rather than just for SHUTDOWN_CORRUPT_INCORE > cases. 
> > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- > fs/xfs/xfs_fsops.c | 63 ++++++++++++++-------------- > fs/xfs/xfs_log.c | 100 ++++++++++++++++++++------------------------- > fs/xfs/xfs_log.h | 2 +- > 3 files changed, 76 insertions(+), 89 deletions(-) > > diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c > index 6ed29b158312..7a2f4feacc35 100644 > --- a/fs/xfs/xfs_fsops.c > +++ b/fs/xfs/xfs_fsops.c > @@ -511,6 +511,11 @@ xfs_fs_goingdown( > * consistent. We don't do an unmount here; just shutdown the shop, make sure > * that absolutely nothing persistent happens to this filesystem after this > * point. > + * > + * The shutdown state change is atomic, resulting in the first and only the > + * first shutdown call processing the shutdown. This means we only shutdown the > + * log once as it requires, and we don't spam the logs when multiple concurrent > + * shutdowns race to set the shutdown flags. > */ > void > xfs_do_force_shutdown( > @@ -519,48 +524,40 @@ xfs_do_force_shutdown( > char *fname, > int lnnum) > { > - bool logerror = flags & SHUTDOWN_LOG_IO_ERROR; > + int tag; > + const char *why; > > - /* > - * No need to duplicate efforts. > - */ > - if (XFS_FORCED_SHUTDOWN(mp) && !logerror) > - return; > - > - /* > - * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't > - * queue up anybody new on the log reservations, and wakes up > - * everybody who's sleeping on log reservations to tell them > - * the bad news. > - */ > - if (xfs_log_force_umount(mp, logerror)) > - return; > - > - if (flags & SHUTDOWN_FORCE_UMOUNT) { > - xfs_alert(mp, > -"User initiated shutdown (0x%x) received. 
Shutting down filesystem", > - flags); > + spin_lock(&mp->m_sb_lock); > + if (XFS_FORCED_SHUTDOWN(mp)) { > + spin_unlock(&mp->m_sb_lock); > return; > } > + mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; > + if (mp->m_sb_bp) > + mp->m_sb_bp->b_flags |= XBF_DONE; > + spin_unlock(&mp->m_sb_lock); > + > + if (flags & SHUTDOWN_FORCE_UMOUNT) > + xfs_alert(mp, "User initiated shutdown received."); > > - if (flags & SHUTDOWN_CORRUPT_INCORE) { > - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, > -"Corruption of in-memory data (0x%x) detected at %pS (%s:%d). Shutting down filesystem", > - flags, __return_address, fname, lnnum); > - if (XFS_ERRLEVEL_HIGH <= xfs_error_level) > - xfs_stack_trace(); > - } else if (logerror) { > - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, > -"Log I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem", > - flags, __return_address, fname, lnnum); > + if (xlog_force_shutdown(mp->m_log, flags)) { > + tag = XFS_PTAG_SHUTDOWN_LOGERROR; > + why = "Log I/O Error"; > + } else if (flags & SHUTDOWN_CORRUPT_INCORE) { > + tag = XFS_PTAG_SHUTDOWN_CORRUPT; > + why = "Corruption of in-memory data"; > } else { > - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, > -"I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem", > - flags, __return_address, fname, lnnum); > + tag = XFS_PTAG_SHUTDOWN_IOERROR; > + why = "Metadata I/O Error"; > } > > + xfs_alert_tag(mp, tag, > +"%s (0x%x) detected at %pS (%s:%d). 
Shutting down filesystem.", > + why, flags, __return_address, fname, lnnum); > xfs_alert(mp, > "Please unmount the filesystem and rectify the problem(s)"); > + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) > + xfs_stack_trace(); > } > > /* > diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c > index 54fff0d41fd0..f996f51c6cee 100644 > --- a/fs/xfs/xfs_log.c > +++ b/fs/xfs/xfs_log.c > @@ -3673,76 +3673,66 @@ xlog_verify_iclog( > #endif > > /* > - * This is called from xfs_force_shutdown, when we're forcibly > - * shutting down the filesystem, typically because of an IO error. > - * Our main objectives here are to make sure that: > - * a. if !logerror, flush the logs to disk. Anything modified > - * after this is ignored. > - * b. the filesystem gets marked 'SHUTDOWN' for all interested > - * parties to find out, 'atomically'. > - * c. those who're sleeping on log reservations, pinned objects and > - * other resources get woken up, and be told the bad news. > - * d. nothing new gets queued up after (b) and (c) are done. > + * Perform a forced shutdown on the log. This should be called once and once > + * only by the high level filesystem shutdown code to shut the log subsystem > + * down cleanly. > * > - * Note: for the !logerror case we need to flush the regions held in memory out > - * to disk first. This needs to be done before the log is marked as shutdown, > - * otherwise the iclog writes will fail. > + * Our main objectives here are to make sure that: > + * a. if the shutdown was not due to a log IO error, flush the logs to > + * disk. Anything modified after this is ignored. > + * b. the log gets atomically marked 'XLOG_IO_ERROR' for all interested > + * parties to find out. Nothing new gets queued after this is done. > + * c. Tasks sleeping on log reservations, pinned objects and > + * other resources get woken up. > * > - * Return non-zero if log shutdown transition had already happened. 
> + * Return true if the shutdown cause was a log IO error and we actually shut the > + * log down. > */ > -int > -xfs_log_force_umount( > - struct xfs_mount *mp, > - int logerror) > +bool > +xlog_force_shutdown( > + struct xlog *log, > + int shutdown_flags) > { > - struct xlog *log; > - int retval = 0; > - > - log = mp->m_log; > + bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR); > > /* > - * If this happens during log recovery, don't worry about > - * locking; the log isn't open for business yet. > + * If this happens during log recovery then we aren't using the runtime > + * log mechanisms yet so there's nothing to shut down. > */ > - if (!log || xlog_in_recovery(log)) { > - mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; > - if (mp->m_sb_bp) > - mp->m_sb_bp->b_flags |= XBF_DONE; > - return 0; > - } > + if (!log || xlog_in_recovery(log)) > + return false; > > - /* > - * Somebody could've already done the hard work for us. > - * No need to get locks for this. > - */ > - if (logerror && xlog_is_shutdown(log)) > - return 1; > + ASSERT(!xlog_is_shutdown(log)); > > /* > * Flush all the completed transactions to disk before marking the log > - * being shut down. We need to do it in this order to ensure that > - * completed operations are safely on disk before we shut down, and that > - * we don't have to issue any buffer IO after the shutdown flags are set > - * to guarantee this. > + * being shut down. We need to do this first as shutting down the log > + * before the force will prevent the log force from flushing the iclogs > + * to disk. > + * > + * Re-entry due to a log IO error shutdown during the log force is > + * prevented by the atomicity of higher level shutdown code. > */ > - if (!logerror) > - xfs_log_force(mp, XFS_LOG_SYNC); > + if (!log_error) > + xfs_log_force(log->l_mp, XFS_LOG_SYNC); > > /* > - * mark the filesystem and the as in a shutdown state and wake > - * everybody up to tell them the bad news. > + * Atomically set the shutdown state. 
If the shutdown state is already > + * set, then someone else is performing the shutdown and so we are done > + * here. This should never happen because we should only ever get called > + * once by the first shutdown caller. > + * > + * Much of the log state machine transitions assume that shutdown state > + * cannot change once they hold the log->l_icloglock. Hence we need to > + * hold that lock here, even though we use the atomic test_and_set_bit() > + * operation to set the shutdown state. > */ > spin_lock(&log->l_icloglock); > - mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; > - if (mp->m_sb_bp) > - mp->m_sb_bp->b_flags |= XBF_DONE; > - > - /* > - * Mark the log and the iclogs with IO error flags to prevent any > - * further log IO from being issued or completed. > - */ > - if (!test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) > - retval = 1; > + if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) { > + spin_unlock(&log->l_icloglock); > + ASSERT(0); > + return false; > + } > spin_unlock(&log->l_icloglock); > > /* > @@ -3766,7 +3756,7 @@ xfs_log_force_umount( > spin_unlock(&log->l_cilp->xc_push_lock); > xlog_state_do_callback(log); > > - return retval; > + return log_error; > } > > STATIC int > diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h > index 4c5c8a7db1d9..3f680f0c9744 100644 > --- a/fs/xfs/xfs_log.h > +++ b/fs/xfs/xfs_log.h > @@ -125,7 +125,6 @@ int xfs_log_reserve(struct xfs_mount *mp, > bool permanent); > int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); > void xfs_log_unmount(struct xfs_mount *mp); > -int xfs_log_force_umount(struct xfs_mount *mp, int logerror); > bool xfs_log_writable(struct xfs_mount *mp); > > struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); > @@ -140,5 +139,6 @@ void xfs_log_clean(struct xfs_mount *mp); > bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); > > xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); > +bool xlog_force_shutdown(struct xlog *log, int shutdown_flags); 
> > #endif /* __XFS_LOG_H__ */ > -- > 2.31.1 >