The lockless -> locking log grant head reservation algorithm is somewhat racy. If multiple tasks check the grant head at the same time, they use the same free space calculation and potentially consume more reservation than is available based on the current log tail. This leads to occasional warnings from xlog_verify_grant_tail() on debug kernels. This is acceptable because the algorithm is racy by design for performance reasons and because an occasional tail overrun by the grant head(s) is not a critical error. However, grant tail overruns are not infrequent on certain systems with capability of high concurrency and smaller logs. With the oneshot nature of the grant tail warning disabled, grant overruns are prevalent and nearly continuous. While this still may not be a critical error in and of itself, it can be avoided with minimal cost and fairly minor changes to the algorithm. The lockless grant head update algorithm is already sensitive to concurrency in that the function retries if another task has updated the grant head between the time the current task sampled it, updated the value and attempts an atomic cmpxchg. To mitigate excessive tail overruns, repeat the free space check on each retry of the head update and if it fails, retry the head check such that the task can be queued. This technically can reorder some transactions, but only with respect to transactions that are already racing in the lockless algorithm, which is already racy enough to not provide any kind of predictable per-task ordering. This significantly reduces the frequency of grant tail overruns on highly concurrent workloads against filesystems with tiny logs without perturbation of the same workload on default size logs. Signed-off-by: Brian Foster <bfoster@xxxxxxxxxx> --- fs/xfs/xfs_log.c | 47 ++++++++++++++++++++++++++++++++++------------ fs/xfs/xfs_trace.h | 1 + 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ce0aac89e675..19b1c2ab4661 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -41,6 +41,10 @@ STATIC int xlog_space_left( struct xlog *log, atomic64_t *head); +STATIC int +__xlog_space_left( + struct xlog *log, + int64_t head_val); STATIC void xlog_dealloc_log( struct xlog *log); @@ -139,19 +143,24 @@ xlog_grant_sub_space( } while (head_val != old); } -static void +static bool xlog_grant_add_space( struct xlog *log, atomic64_t *head, - int bytes) + int bytes, + bool check_free) { int64_t head_val = atomic64_read(head); - int64_t new, old; + int64_t new, old, free_bytes; do { int tmp; int cycle, space; + free_bytes = __xlog_space_left(log, head_val); + if (check_free && bytes > free_bytes) + return false; + xlog_crack_grant_head_val(head_val, &cycle, &space); tmp = log->l_logsize - space; @@ -166,6 +175,8 @@ xlog_grant_add_space( new = xlog_assign_grant_head_val(cycle, space); head_val = atomic64_cmpxchg(head, old, new); } while (head_val != old); + + return true; } STATIC void @@ -326,6 +337,7 @@ xlog_grant_head_check( * up all the waiters then go to sleep waiting for more free space, * otherwise try to get some space for this transaction. */ +retry: *need_bytes = xlog_ticket_reservation(log, head, tic); free_bytes = xlog_space_left(log, &head->grant); if (!list_empty_careful(&head->waiters)) { @@ -342,8 +354,12 @@ xlog_grant_head_check( spin_unlock(&head->lock); } - if (!error) - xlog_grant_add_space(log, &head->grant, *need_bytes); + if (!error) { + if (!xlog_grant_add_space(log, &head->grant, *need_bytes, true)) { + trace_xfs_log_grant_retry(log, tic); + goto retry; + } + } return error; } @@ -470,7 +486,7 @@ xfs_log_reserve( if (error) goto out_error; - xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes, false); trace_xfs_log_reserve_exit(log, tic); xlog_verify_grant_tail(log); return 0; @@ -1186,9 +1202,9 @@ xlog_assign_tail_lsn( * result is that we return the size of the log as the amount of space left. */ STATIC int -xlog_space_left( +__xlog_space_left( struct xlog *log, - atomic64_t *head) + int64_t head_val) { int free_bytes; int tail_bytes; @@ -1196,7 +1212,7 @@ xlog_space_left( int head_cycle; int head_bytes; - xlog_crack_grant_head(head, &head_cycle, &head_bytes); + xlog_crack_grant_head_val(head_val, &head_cycle, &head_bytes); xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); tail_bytes = BBTOB(tail_bytes); if (tail_cycle == head_cycle && head_bytes >= tail_bytes) @@ -1225,6 +1241,13 @@ xlog_space_left( return free_bytes; } +STATIC int +xlog_space_left( + struct xlog *log, + atomic64_t *head) +{ + return __xlog_space_left(log, atomic64_read(head)); +} static void xlog_ioend_work( @@ -1871,8 +1894,8 @@ xlog_sync( count = xlog_calc_iclog_size(log, iclog, &roundoff); /* move grant heads by roundoff in sync */ - xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); - xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); + xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff, false); + xlog_grant_add_space(log, &log->l_write_head.grant, roundoff, false); /* put cycle number in every block */ xlog_pack_data(log, iclog, roundoff); @@ -3107,7 +3130,7 @@ xlog_regrant_reserve_log_space( return; xlog_grant_add_space(log, &log->l_reserve_head.grant, - ticket->t_unit_res); + ticket->t_unit_res, false); trace_xfs_log_regrant_reserve_exit(log, ticket); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index eaae275ed430..5ff096325567 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1001,6 +1001,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_retry); DEFINE_LOGGRANT_EVENT(xfs_log_reserve); DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant); -- 2.20.1