On Mon, Nov 29, 2010 at 12:38:28PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> The only thing that the grant lock remains to protect is the grant head
> manipulations when adding or removing space from the log. These calculations
> are already based on atomic variables, so we can already update them safely
> without locks. However, the grant head manipulations require atomic multi-step
> calculations to be executed, which the algorithms currently don't allow.
> 
> To make these multi-step calculations atomic, convert the algorithms to
> compare-and-exchange loops on the atomic variables. That is, we sample the old
> value, perform the calculation and use atomic64_cmpxchg() to attempt to update
> the head with the new value. If the head has not changed since we sampled it,
> it will succeed and we are done. Otherwise, we rerun the calculation again from
> a new sample of the head.
> 
> This allows us to remove the grant lock from around all the grant head space
> manipulations, and that effectively removes the grant lock from the log
> completely.
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> ---
>  fs/xfs/xfs_log.c |  133 ++++++++++++++++++++++++++++++------------------------
>  1 files changed, 74 insertions(+), 59 deletions(-)
> 
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index 8365496..50cf6af 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -111,6 +111,13 @@ STATIC int xlog_iclogs_empty(xlog_t *log);
>   * is the byte offset into the log for the marker. Unlike the xfs_lsn_t, this
>   * is held in bytes rather than basic blocks, even though it uses the
>   * BLOCK_LSN() macro to extract it.
> + *
> + * We use a lock-free compare and exchange algorithm to atomically update
> + * the grant head. That is, we sample the current head, crack it, perform the
> + * calculation, recombine it into a new value, and then conditionally set the
> + * value back into the atomic variable only if it hasn't changed since we first
> + * sampled it. This provides atomic updates of the head, even though we do a
> + * non-atomic, multi-step calculation to arrive at the new value.
>   */
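For anyone who hasn't seen the pattern before, here is a minimal userspace
sketch of the same retry loop. This is my own illustration, not code from the
patch: it uses C11 atomics, where the compare-exchange returns a success flag
and refreshes the expected value on failure (unlike atomic64_cmpxchg(), which
returns the old value), and the LSN packing macros are stand-ins for what
CYCLE_LSN()/BLOCK_LSN()/xlog_assign_lsn() do:

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative stand-ins: cycle in the high 32 bits, byte offset in the low. */
#define LSN(cycle, space)   (((uint64_t)(cycle) << 32) | (uint32_t)(space))
#define LSN_CYCLE(lsn)      ((int)((lsn) >> 32))
#define LSN_SPACE(lsn)      ((int)((lsn) & 0xffffffff))

static void
grant_add_space(_Atomic uint64_t *head, int bytes, int logsize)
{
        uint64_t old = atomic_load(head);
        uint64_t new;

        do {
                /* crack the sampled head into cycle/space */
                int cycle = LSN_CYCLE(old);
                int space = LSN_SPACE(old);

                /* non-atomic multi-step calculation, wrapping at logsize */
                if (logsize - space > bytes) {
                        space += bytes;
                } else {
                        cycle++;
                        space = bytes - (logsize - space);
                }
                new = LSN(cycle, space);

                /*
                 * Install the new value only if nobody moved the head under
                 * us; on failure 'old' is reloaded and we redo the sums.
                 */
        } while (!atomic_compare_exchange_weak(head, &old, new));
}

The weak variant may fail spuriously, which is fine here because we are in a
retry loop anyway.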
>  static void
>  __xlog_grant_sub_space(
> @@ -119,16 +126,23 @@ __xlog_grant_sub_space(
>          int             logsize)
>  {
>          xfs_lsn_t       head_lsn = atomic64_read(head);
> -        int             cycle, space;
> +        xfs_lsn_t       old, new;
>  
> -        cycle = CYCLE_LSN(head_lsn);
> -        space = BLOCK_LSN(head_lsn);
> -        space -= bytes;
> -        if (space < 0) {
> -                cycle--;
> -                space += logsize;
> -        }
> -        atomic64_set(head, xlog_assign_lsn(cycle, space));
> +        do {
> +                int     cycle, space;
> +
> +                cycle = CYCLE_LSN(head_lsn);
> +                space = BLOCK_LSN(head_lsn);
> +                space -= bytes;
> +                if (space < 0) {
> +                        cycle--;
> +                        space += logsize;
> +                }
> +
> +                old = head_lsn;
> +                new = xlog_assign_lsn(cycle, space);
> +                head_lsn = atomic64_cmpxchg(head, old, new);
> +        } while (head_lsn != old);
>  }
>  
>  static void
> @@ -138,18 +152,25 @@ __xlog_grant_add_space(
>          int             logsize)
>  {
>          xfs_lsn_t       head_lsn = atomic64_read(head);
> -        int             cycle, space, tmp;
> +        xfs_lsn_t       old, new;
>  
> -        cycle = CYCLE_LSN(head_lsn);
> -        space = BLOCK_LSN(head_lsn);
> -        tmp = logsize - space;
> -        if (tmp > bytes)
> -                space += bytes;
> -        else {
> -                cycle++;
> -                space = bytes - tmp;
> -        }
> -        atomic64_set(head, xlog_assign_lsn(cycle, space));
> +        do {
> +                int     cycle, space, tmp;
> +
> +                cycle = CYCLE_LSN(head_lsn);
> +                space = BLOCK_LSN(head_lsn);
> +                tmp = logsize - space;
> +                if (tmp > bytes)
> +                        space += bytes;
> +                else {
> +                        cycle++;
> +                        space = bytes - tmp;
> +                }
> +
> +                old = head_lsn;
> +                new = xlog_assign_lsn(cycle, space);
> +                head_lsn = atomic64_cmpxchg(head, old, new);
> +        } while (head_lsn != old);
>  }
>  
>  static inline void
> @@ -360,11 +381,9 @@ xfs_log_reserve(
>  
>                  trace_xfs_log_reserve(log, internal_ticket);
>  
> -                spin_lock(&log->l_grant_lock);
>                  xlog_grant_push_ail(log, atomic64_read(&log->l_tail_lsn),
>                                          atomic64_read(&log->l_last_sync_lsn),
>                                          internal_ticket->t_unit_res);
> -                spin_unlock(&log->l_grant_lock);
>                  retval = xlog_regrant_write_log_space(log, internal_ticket);
>          } else {
>                  /* may sleep if need to allocate more tickets */
> @@ -378,12 +397,10 @@ xfs_log_reserve(
>  
>                  trace_xfs_log_reserve(log, internal_ticket);
>  
> -                spin_lock(&log->l_grant_lock);
>                  xlog_grant_push_ail(log, atomic64_read(&log->l_tail_lsn),
>                                          atomic64_read(&log->l_last_sync_lsn),
>                                          (internal_ticket->t_unit_res *
>                                           internal_ticket->t_cnt));
> -                spin_unlock(&log->l_grant_lock);
>                  retval = xlog_grant_log_space(log, internal_ticket);
>          }
>  
> @@ -1376,9 +1393,7 @@ xlog_sync(xlog_t *log,
>                  roundoff < BBTOB(1)));
>  
>          /* move grant heads by roundoff in sync */
> -        spin_lock(&log->l_grant_lock);
>          xlog_grant_add_space(log, roundoff);
> -        spin_unlock(&log->l_grant_lock);
>  
>          /* put cycle number in every block */
>          xlog_pack_data(log, iclog, roundoff);
> @@ -2618,13 +2633,11 @@ redo:
>          }
>  
>          /* we've got enough space */
> -        spin_lock(&log->l_grant_lock);
>          xlog_grant_add_space(log, need_bytes);
>  
>          trace_xfs_log_grant_exit(log, tic);
>          xlog_verify_grant_tail(log);
>          xlog_verify_grant_head(log, 1);
> -        spin_unlock(&log->l_grant_lock);
>          return 0;
>  
>  error_return:
> @@ -2752,13 +2765,11 @@ redo:
>          }
>  
>          /* we've got enough space */
> -        spin_lock(&log->l_grant_lock);
>          xlog_grant_add_space_write(log, need_bytes);
>  
>          trace_xfs_log_regrant_write_exit(log, tic);
>          xlog_verify_grant_tail(log);
>          xlog_verify_grant_head(log, 1);
> -        spin_unlock(&log->l_grant_lock);
>          return 0;
>  
>  
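The conversions above look correct to me. To convince myself the retry loop
doesn't lose updates under contention, I threw the earlier sketch at a quick
pthread smoke test (again my own, illustrative only; it reuses the
grant_add_space()/LSN helpers from that sketch, and the sizes are arbitrary):

#include <pthread.h>
#include <stdio.h>

#define LOGSIZE         (1 << 20)       /* pretend 1MB log */
#define NTHREADS        4
#define NADDS           100000

static _Atomic uint64_t grant_head = LSN(1, 0);

static void *
worker(void *arg)
{
        (void)arg;
        for (int i = 0; i < NADDS; i++)
                grant_add_space(&grant_head, 64, LOGSIZE);
        return NULL;
}

int
main(void)
{
        pthread_t t[NTHREADS];

        for (int i = 0; i < NTHREADS; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < NTHREADS; i++)
                pthread_join(t[i], NULL);

        /*
         * Every add advances the head by exactly 64 bytes, wrapping into a
         * new cycle as needed, so the total distance travelled must equal
         * NTHREADS * NADDS * 64 if no update was lost.
         */
        uint64_t h = atomic_load(&grant_head);
        long long moved = (long long)(LSN_CYCLE(h) - 1) * LOGSIZE + LSN_SPACE(h);
        printf("moved %lld bytes, expected %lld\n",
               moved, (long long)NTHREADS * NADDS * 64);
        return 0;
}

Dropping the loop and doing a plain load/modify/store makes the two totals
diverge immediately once there is no lock, which is exactly the lost update
the cmpxchg loop closes.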
> @@ -2779,23 +2790,23 @@ redo:
>          }
>  
>  
> -/* The first cnt-1 times through here we don't need to
> - * move the grant write head because the permanent
> - * reservation has reserved cnt times the unit amount.
> - * Release part of current permanent unit reservation and
> - * reset current reservation to be one units worth. Also
> - * move grant reservation head forward.
> +/*
> + * The first cnt-1 times through here we don't need to move the grant write
> + * head because the permanent reservation has reserved cnt times the unit
> + * amount. Release part of current permanent unit reservation and reset
> + * current reservation to be one unit's worth. Also move grant reservation
> + * head forward.
>   */
>  STATIC void
> -xlog_regrant_reserve_log_space(xlog_t *log,
> -                               xlog_ticket_t *ticket)
> +xlog_regrant_reserve_log_space(
> +        struct log              *log,
> +        struct xlog_ticket      *ticket)
>  {
>          trace_xfs_log_regrant_reserve_enter(log, ticket);
>  
>          if (ticket->t_cnt > 0)
>                  ticket->t_cnt--;
>  
> -        spin_lock(&log->l_grant_lock);
>          xlog_grant_sub_space(log, ticket->t_curr_res);
>          ticket->t_curr_res = ticket->t_unit_res;
>          xlog_tic_reset_res(ticket);
> @@ -2805,20 +2816,17 @@ xlog_regrant_reserve_log_space(xlog_t *log,
>          xlog_verify_grant_head(log, 1);
>  
>          /* just return if we still have some of the pre-reserved space */
> -        if (ticket->t_cnt > 0) {
> -                spin_unlock(&log->l_grant_lock);
> +        if (ticket->t_cnt > 0)
>                  return;
> -        }
>  
>          xlog_grant_add_space_reserve(log, ticket->t_unit_res);
>  
>          trace_xfs_log_regrant_reserve_exit(log, ticket);
>  
>          xlog_verify_grant_head(log, 0);
> -        spin_unlock(&log->l_grant_lock);
>          ticket->t_curr_res = ticket->t_unit_res;
>          xlog_tic_reset_res(ticket);
> -}        /* xlog_regrant_reserve_log_space */
> +}
>  
>  
>  /*
> @@ -2836,34 +2844,33 @@ xlog_regrant_reserve_log_space(xlog_t *log,
>   * in the current reservation field.
>   */
>  STATIC void
> -xlog_ungrant_log_space(xlog_t *log,
> -                       xlog_ticket_t *ticket)
> +xlog_ungrant_log_space(
> +        struct log              *log,
> +        struct xlog_ticket      *ticket)
>  {
> -        if (ticket->t_cnt > 0)
> -                ticket->t_cnt--;
> +        int     space;
>  
> -        spin_lock(&log->l_grant_lock);
>          trace_xfs_log_ungrant_enter(log, ticket);
> +        if (ticket->t_cnt > 0)
> +                ticket->t_cnt--;
>  
> -        xlog_grant_sub_space(log, ticket->t_curr_res);
> -
> -        trace_xfs_log_ungrant_sub(log, ticket);
> -
> -        /* If this is a permanent reservation ticket, we may be able to free
> +        /*
> +         * If this is a permanent reservation ticket, we may be able to free
>           * up more space based on the remaining count.
>           */
> +        space = ticket->t_curr_res;
>          if (ticket->t_cnt > 0) {
>                  ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
> -                xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt);
> +                space += ticket->t_unit_res * ticket->t_cnt;
>          }
> +        trace_xfs_log_ungrant_sub(log, ticket);
> +        xlog_grant_sub_space(log, space);
>  
>          trace_xfs_log_ungrant_exit(log, ticket);
> -
>          xlog_verify_grant_head(log, 1);
> -        spin_unlock(&log->l_grant_lock);
> -        xfs_log_move_tail(log->l_mp, 1);
> -}        /* xlog_ungrant_log_space */
>  
> +        xfs_log_move_tail(log->l_mp, 1);
> +}
>  
>  /*
>   * Flush iclog to disk if this is the last reference to the given iclog and
> @@ -3478,11 +3485,18 @@ xlog_verify_dest_ptr(
>                  xlog_panic("xlog_verify_dest_ptr: invalid ptr");
>  }
>  
> +/*
> + * XXX: because we cannot atomically sample both the reserve and write heads,
> + * we cannot verify them reliably as they may be sampled in the middle of
> + * racing modifications. Hence just taking snapshots of the heads can give an
> + * incorrect view of the state of the log, so just disable this check for now.
> + */
>  STATIC void
>  xlog_verify_grant_head(

I can't see any way to keep this check with the atomic reserve/write heads,
so we might as well remove it entirely.
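To make the failure mode concrete, here is a standalone toy (mine, not XFS
code, and the head names are just stand-ins): the invariant reserve >= write
genuinely holds at every instant, each head is only ever updated atomically,
yet a checker that samples the two heads with separate loads still reports
violations:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic int64_t reserve_head;
static _Atomic int64_t write_head;

static void *
updater(void *arg)
{
        (void)arg;
        for (int i = 0; i < 10000000; i++) {
                /* reserve is bumped first, so reserve >= write always holds */
                atomic_fetch_add(&reserve_head, 8);
                atomic_fetch_add(&write_head, 8);
        }
        return NULL;
}

int
main(void)
{
        pthread_t t;
        long misfires = 0;

        pthread_create(&t, NULL, updater, NULL);
        for (int i = 0; i < 10000000; i++) {
                int64_t r = atomic_load(&reserve_head);
                /* the updater can move both heads between these two loads */
                int64_t w = atomic_load(&write_head);
                if (r < w)
                        misfires++;     /* snapshot looks corrupt; it isn't */
        }
        pthread_join(t, NULL);
        printf("%ld false invariant violations\n", misfires);
        return 0;
}

Any assertion built on such a two-load snapshot will fire spuriously under
load, so deleting the check rather than leaving it disabled seems right.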
Otherwise the patch looks good,

Reviewed-by: Christoph Hellwig <hch@xxxxxx>