Tested the patch with testcases 234 and 273. They ran for more than 350 iterations without getting into the hang situation. Tested-by: Chandra Seetharaman <sekharan@xxxxxxxxxx> Few generic comments on the patch 1. xlog_*_wake could use something to indicate that they are looking for log space in the specific queue. ex: xlog_reserveq_available() 2. All new functions expect a lock to be held on entry. Can be explicitly specified in a comment. 3. Change the trace function names to reflect the names of the function ?! Otherwise patch looks good and works fine. Chandra ------- end of comments ------ On Mon, 2011-11-28 at 03:17 -0500, Christoph Hellwig wrote: > plain text document attachment (xfs-fix-log-race) > Apply the scheme used in log_regrant_write_log_space to wake up any other > threads waiting for log space before the newly added one to > log_regrant_write_log_space as well, and factor the code into readable > helpers. For each of the queues we have add two helpers: > > - one to try to wake up all waiting threads. This helper will also be > usable by xfs_log_move_tail once we remove the current opportunistic > wakeups in it. > - one to sleep on t_wait until enough log space is available, loosely > modelled after Linux waitqueues. > > And use them to reimplement the guts of log_regrant_write_log_space and > log_regrant_write_log_space. These two function now use one and the same > algorithm for waiting on log space instead of subtly different ones before, > with an option to completely unify them in the near future. > > Also move the filesystem shutdown handling to the common caller given > that we had to touch it anyway. > > Based on hard debugging and an earlier patch from > Chandra Seetharaman <sekharan@xxxxxxxxxx>. > > Signed-off-by: Christoph Hellwig <hch@xxxxxx> > > --- > fs/xfs/xfs_log.c | 348 ++++++++++++++++++++++++++--------------------------- > fs/xfs/xfs_trace.h | 12 - > 2 files changed, 177 insertions(+), 183 deletions(-) > > Index: xfs/fs/xfs/xfs_log.c > =================================================================== > --- xfs.orig/fs/xfs/xfs_log.c 2011-11-20 16:29:49.356194023 +0100 > +++ xfs/fs/xfs/xfs_log.c 2011-11-20 16:46:33.440754431 +0100 > @@ -150,6 +150,117 @@ xlog_grant_add_space( > } while (head_val != old); > } > > +STATIC bool > +xlog_reserveq_wake( > + struct log *log, > + int *free_bytes) > +{ > + struct xlog_ticket *tic; > + int need_bytes; > + > + list_for_each_entry(tic, &log->l_reserveq, t_queue) { > + if (tic->t_flags & XLOG_TIC_PERM_RESERV) > + need_bytes = tic->t_unit_res * tic->t_cnt; > + else > + need_bytes = tic->t_unit_res; > + > + if (*free_bytes < need_bytes) > + return false; > + *free_bytes -= need_bytes; > + > + trace_xfs_log_grant_wake_up(log, tic); > + wake_up(&tic->t_wait); > + } > + > + return true; > +} > + > +STATIC bool > +xlog_writeq_wake( > + struct log *log, > + int *free_bytes) > +{ > + struct xlog_ticket *tic; > + int need_bytes; > + > + list_for_each_entry(tic, &log->l_writeq, t_queue) { > + ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); > + > + need_bytes = tic->t_unit_res; > + > + if (*free_bytes < need_bytes) > + return false; > + *free_bytes -= need_bytes; > + > + trace_xfs_log_regrant_write_wake_up(log, tic); > + wake_up(&tic->t_wait); > + } > + > + return true; > +} > + > +STATIC int > +xlog_reserveq_wait( > + struct log *log, > + struct xlog_ticket *tic, > + int need_bytes) > +{ > + list_add_tail(&tic->t_queue, &log->l_reserveq); > + > + do { > + if (XLOG_FORCED_SHUTDOWN(log)) > + goto shutdown; > + xlog_grant_push_ail(log, need_bytes); > + > + XFS_STATS_INC(xs_sleep_logspace); > + trace_xfs_log_grant_sleep(log, tic); > + > + xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); > + trace_xfs_log_grant_wake(log, tic); > + > + spin_lock(&log->l_grant_reserve_lock); > + if (XLOG_FORCED_SHUTDOWN(log)) > + goto shutdown; > + } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes); > + > + list_del_init(&tic->t_queue); > + return 0; > +shutdown: > + list_del_init(&tic->t_queue); > + return XFS_ERROR(EIO); > +} > + > +STATIC int > +xlog_writeq_wait( > + struct log *log, > + struct xlog_ticket *tic, > + int need_bytes) > +{ > + list_add_tail(&tic->t_queue, &log->l_writeq); > + > + do { > + if (XLOG_FORCED_SHUTDOWN(log)) > + goto shutdown; > + xlog_grant_push_ail(log, need_bytes); > + > + XFS_STATS_INC(xs_sleep_logspace); > + trace_xfs_log_regrant_write_sleep(log, tic); > + > + xlog_wait(&tic->t_wait, &log->l_grant_write_lock); > + trace_xfs_log_regrant_write_wake(log, tic); > + > + spin_lock(&log->l_grant_write_lock); > + if (XLOG_FORCED_SHUTDOWN(log)) > + goto shutdown; > + } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes); > + > + list_del_init(&tic->t_queue); > + return 0; > +shutdown: > + list_del_init(&tic->t_queue); > + return XFS_ERROR(EIO); > +} > + > static void > xlog_tic_reset_res(xlog_ticket_t *tic) > { > @@ -350,8 +461,19 @@ xfs_log_reserve( > retval = xlog_grant_log_space(log, internal_ticket); > } > > + if (unlikely(retval)) { > + /* > + * If we are failing, make sure the ticket doesn't have any > + * current reservations. We don't want to add this back > + * when the ticket/ transaction gets cancelled. > + */ > + internal_ticket->t_curr_res = 0; > + /* ungrant will give back unit_res * t_cnt. */ > + internal_ticket->t_cnt = 0; > + } > + > return retval; > -} /* xfs_log_reserve */ > +} > > > /* > @@ -2481,8 +2603,8 @@ restart: > /* > * Atomically get the log space required for a log ticket. > * > - * Once a ticket gets put onto the reserveq, it will only return after > - * the needed reservation is satisfied. > + * Once a ticket gets put onto the reserveq, it will only return after the > + * needed reservation is satisfied. > * > * This function is structured so that it has a lock free fast path. This is > * necessary because every new transaction reservation will come through this > @@ -2490,113 +2612,53 @@ restart: > * every pass. > * > * As tickets are only ever moved on and off the reserveq under the > - * l_grant_reserve_lock, we only need to take that lock if we are going > - * to add the ticket to the queue and sleep. We can avoid taking the lock if the > - * ticket was never added to the reserveq because the t_queue list head will be > - * empty and we hold the only reference to it so it can safely be checked > - * unlocked. > + * l_grant_reserve_lock, we only need to take that lock if we are going to add > + * the ticket to the queue and sleep. We can avoid taking the lock if the ticket > + * was never added to the reserveq because the t_queue list head will be empty > + * and we hold the only reference to it so it can safely be checked unlocked. > */ > STATIC int > -xlog_grant_log_space(xlog_t *log, > - xlog_ticket_t *tic) > +xlog_grant_log_space( > + struct log *log, > + struct xlog_ticket *tic) > { > - int free_bytes; > - int need_bytes; > + int free_bytes, need_bytes; > + int error = 0; > > -#ifdef DEBUG > - if (log->l_flags & XLOG_ACTIVE_RECOVERY) > - panic("grant Recovery problem"); > -#endif > + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); > > trace_xfs_log_grant_enter(log, tic); > > + /* > + * If there are other waiters on the queue then give them a chance at > + * logspace before us. Wake up the first waiters, if we do not wake > + * up all the waiters then go to sleep waiting for more free space, > + * otherwise try to get some space for this transaction. > + */ > need_bytes = tic->t_unit_res; > if (tic->t_flags & XFS_LOG_PERM_RESERV) > need_bytes *= tic->t_ocnt; > - > - /* something is already sleeping; insert new transaction at end */ > - if (!list_empty_careful(&log->l_reserveq)) { > - spin_lock(&log->l_grant_reserve_lock); > - /* recheck the queue now we are locked */ > - if (list_empty(&log->l_reserveq)) { > - spin_unlock(&log->l_grant_reserve_lock); > - goto redo; > - } > - list_add_tail(&tic->t_queue, &log->l_reserveq); > - > - trace_xfs_log_grant_sleep1(log, tic); > - > - /* > - * Gotta check this before going to sleep, while we're > - * holding the grant lock. > - */ > - if (XLOG_FORCED_SHUTDOWN(log)) > - goto error_return; > - > - XFS_STATS_INC(xs_sleep_logspace); > - xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); > - > - /* > - * If we got an error, and the filesystem is shutting down, > - * we'll catch it down below. So just continue... > - */ > - trace_xfs_log_grant_wake1(log, tic); > - } > - > -redo: > - if (XLOG_FORCED_SHUTDOWN(log)) > - goto error_return_unlocked; > - > free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); > - if (free_bytes < need_bytes) { > + if (!list_empty_careful(&log->l_reserveq)) { > spin_lock(&log->l_grant_reserve_lock); > - if (list_empty(&tic->t_queue)) > - list_add_tail(&tic->t_queue, &log->l_reserveq); > - > - trace_xfs_log_grant_sleep2(log, tic); > - > - if (XLOG_FORCED_SHUTDOWN(log)) > - goto error_return; > - > - xlog_grant_push_ail(log, need_bytes); > - > - XFS_STATS_INC(xs_sleep_logspace); > - xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); > - > - trace_xfs_log_grant_wake2(log, tic); > - goto redo; > - } > - > - if (!list_empty(&tic->t_queue)) { > + if (!xlog_reserveq_wake(log, &free_bytes) || > + free_bytes < need_bytes) > + error = xlog_reserveq_wait(log, tic, need_bytes); > + spin_unlock(&log->l_grant_reserve_lock); > + } else if (free_bytes < need_bytes) { > spin_lock(&log->l_grant_reserve_lock); > - list_del_init(&tic->t_queue); > + error = xlog_reserveq_wait(log, tic, need_bytes); > spin_unlock(&log->l_grant_reserve_lock); > } > + if (error) > + return error; > > - /* we've got enough space */ > xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); > xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); > trace_xfs_log_grant_exit(log, tic); > xlog_verify_grant_tail(log); > return 0; > - > -error_return_unlocked: > - spin_lock(&log->l_grant_reserve_lock); > -error_return: > - list_del_init(&tic->t_queue); > - spin_unlock(&log->l_grant_reserve_lock); > - trace_xfs_log_grant_error(log, tic); > - > - /* > - * If we are failing, make sure the ticket doesn't have any > - * current reservations. We don't want to add this back when > - * the ticket/transaction gets cancelled. > - */ > - tic->t_curr_res = 0; > - tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ > - return XFS_ERROR(EIO); > -} /* xlog_grant_log_space */ > - > +} > > /* > * Replenish the byte reservation required by moving the grant write head. > @@ -2605,10 +2667,12 @@ error_return: > * free fast path. > */ > STATIC int > -xlog_regrant_write_log_space(xlog_t *log, > - xlog_ticket_t *tic) > +xlog_regrant_write_log_space( > + struct log *log, > + struct xlog_ticket *tic) > { > - int free_bytes, need_bytes; > + int free_bytes, need_bytes; > + int error = 0; > > tic->t_curr_res = tic->t_unit_res; > xlog_tic_reset_res(tic); > @@ -2616,104 +2680,38 @@ xlog_regrant_write_log_space(xlog_t * > if (tic->t_cnt > 0) > return 0; > > -#ifdef DEBUG > - if (log->l_flags & XLOG_ACTIVE_RECOVERY) > - panic("regrant Recovery problem"); > -#endif > + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); > > trace_xfs_log_regrant_write_enter(log, tic); > - if (XLOG_FORCED_SHUTDOWN(log)) > - goto error_return_unlocked; > > - /* If there are other waiters on the queue then give them a > - * chance at logspace before us. Wake up the first waiters, > - * if we do not wake up all the waiters then go to sleep waiting > - * for more free space, otherwise try to get some space for > - * this transaction. > + /* > + * If there are other waiters on the queue then give them a chance at > + * logspace before us. Wake up the first waiters, if we do not wake > + * up all the waiters then go to sleep waiting for more free space, > + * otherwise try to get some space for this transaction. > */ > need_bytes = tic->t_unit_res; > - if (!list_empty_careful(&log->l_writeq)) { > - struct xlog_ticket *ntic; > - > - spin_lock(&log->l_grant_write_lock); > - free_bytes = xlog_space_left(log, &log->l_grant_write_head); > - list_for_each_entry(ntic, &log->l_writeq, t_queue) { > - ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); > - > - if (free_bytes < ntic->t_unit_res) > - break; > - free_bytes -= ntic->t_unit_res; > - wake_up(&ntic->t_wait); > - } > - > - if (ntic != list_first_entry(&log->l_writeq, > - struct xlog_ticket, t_queue)) { > - if (list_empty(&tic->t_queue)) > - list_add_tail(&tic->t_queue, &log->l_writeq); > - trace_xfs_log_regrant_write_sleep1(log, tic); > - > - xlog_grant_push_ail(log, need_bytes); > - > - XFS_STATS_INC(xs_sleep_logspace); > - xlog_wait(&tic->t_wait, &log->l_grant_write_lock); > - trace_xfs_log_regrant_write_wake1(log, tic); > - } else > - spin_unlock(&log->l_grant_write_lock); > - } > - > -redo: > - if (XLOG_FORCED_SHUTDOWN(log)) > - goto error_return_unlocked; > - > free_bytes = xlog_space_left(log, &log->l_grant_write_head); > - if (free_bytes < need_bytes) { > + if (!list_empty_careful(&log->l_writeq)) { > spin_lock(&log->l_grant_write_lock); > - if (list_empty(&tic->t_queue)) > - list_add_tail(&tic->t_queue, &log->l_writeq); > - > - if (XLOG_FORCED_SHUTDOWN(log)) > - goto error_return; > - > - xlog_grant_push_ail(log, need_bytes); > - > - XFS_STATS_INC(xs_sleep_logspace); > - trace_xfs_log_regrant_write_sleep2(log, tic); > - xlog_wait(&tic->t_wait, &log->l_grant_write_lock); > - > - trace_xfs_log_regrant_write_wake2(log, tic); > - goto redo; > - } > - > - if (!list_empty(&tic->t_queue)) { > + if (!xlog_writeq_wake(log, &free_bytes) || > + free_bytes < need_bytes) > + error = xlog_writeq_wait(log, tic, need_bytes); > + spin_unlock(&log->l_grant_write_lock); > + } else if (free_bytes < need_bytes) { > spin_lock(&log->l_grant_write_lock); > - list_del_init(&tic->t_queue); > + error = xlog_writeq_wait(log, tic, need_bytes); > spin_unlock(&log->l_grant_write_lock); > } > > - /* we've got enough space */ > + if (error) > + return error; > + > xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); > trace_xfs_log_regrant_write_exit(log, tic); > xlog_verify_grant_tail(log); > return 0; > - > - > - error_return_unlocked: > - spin_lock(&log->l_grant_write_lock); > - error_return: > - list_del_init(&tic->t_queue); > - spin_unlock(&log->l_grant_write_lock); > - trace_xfs_log_regrant_write_error(log, tic); > - > - /* > - * If we are failing, make sure the ticket doesn't have any > - * current reservations. We don't want to add this back when > - * the ticket/transaction gets cancelled. > - */ > - tic->t_curr_res = 0; > - tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ > - return XFS_ERROR(EIO); > -} /* xlog_regrant_write_log_space */ > - > +} > > /* The first cnt-1 times through here we don't need to > * move the grant write head because the permanent > Index: xfs/fs/xfs/xfs_trace.h > =================================================================== > --- xfs.orig/fs/xfs/xfs_trace.h 2011-11-20 16:29:49.362860654 +0100 > +++ xfs/fs/xfs/xfs_trace.h 2011-11-20 16:34:23.954706395 +0100 > @@ -834,18 +834,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_wri > DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); > DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); > DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); > -DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); > -DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); > -DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); > -DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); > +DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); > +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); > DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); > DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); > DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); > DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); > -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); > -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); > -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); > -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); > +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep); > +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake); > DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); > DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); > DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); > > _______________________________________________ > xfs mailing list > xfs@xxxxxxxxxxx > http://oss.sgi.com/mailman/listinfo/xfs > _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs