Vivek, does CFQ still need any hints for this sort of handoff? On Tue, Mar 27, 2012 at 08:46:45PM +1100, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > Doing background CIL flushes adds significant latency to whatever > async transaction that triggers it. To avoid blocking async > transactions on things like waiting for log buffer IO to complete, > move the CIL push off into a workqueue. By moving the push work > into a workqueue, we remove all the latency that the commit adds > from the foreground transaction commit path. This also means that > single threaded workloads won't do the CIL push procssing, leaving > them more CPU to do more async transactions. > > To do this, we need to keep track of the sequence number we have > pushed work for. This avoids having many transaction commits > attempting to schedule work for the same sequence, and ensures that > we only ever have one push (background or forced) in progress at a > time. It also means that we don't need to take the CIL lock in write > mode to check for potential background push races, which reduces > lock contention. > > To avoid potential issues with "smart" IO schedulers, don't use the > workqueue for log force triggered flushes. Instead, do them directly > so that the log IO is done directly by the process issuing the log > force and so doesn't get stuck on IO elevator queue idling > incorrectly delaying the log IO from the workqueue. > > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- > fs/xfs/xfs_log_cil.c | 241 ++++++++++++++++++++++++++++++------------------- > fs/xfs/xfs_log_priv.h | 4 + > fs/xfs/xfs_super.c | 6 ++ > 3 files changed, 158 insertions(+), 93 deletions(-) > > diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c > index d4fadbe..6a5a7ba 100644 > --- a/fs/xfs/xfs_log_cil.c > +++ b/fs/xfs/xfs_log_cil.c > @@ -31,57 +31,7 @@ > #include "xfs_alloc.h" > #include "xfs_discard.h" > > -/* > - * Perform initial CIL structure initialisation. > - */ > -int > -xlog_cil_init( > - struct log *log) > -{ > - struct xfs_cil *cil; > - struct xfs_cil_ctx *ctx; > - > - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); > - if (!cil) > - return ENOMEM; > - > - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); > - if (!ctx) { > - kmem_free(cil); > - return ENOMEM; > - } > - > - INIT_LIST_HEAD(&cil->xc_cil); > - INIT_LIST_HEAD(&cil->xc_committing); > - spin_lock_init(&cil->xc_cil_lock); > - init_rwsem(&cil->xc_ctx_lock); > - init_waitqueue_head(&cil->xc_commit_wait); > - > - INIT_LIST_HEAD(&ctx->committing); > - INIT_LIST_HEAD(&ctx->busy_extents); > - ctx->sequence = 1; > - ctx->cil = cil; > - cil->xc_ctx = ctx; > - cil->xc_current_sequence = ctx->sequence; > - > - cil->xc_log = log; > - log->l_cilp = cil; > - return 0; > -} > - > -void > -xlog_cil_destroy( > - struct log *log) > -{ > - if (log->l_cilp->xc_ctx) { > - if (log->l_cilp->xc_ctx->ticket) > - xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); > - kmem_free(log->l_cilp->xc_ctx); > - } > - > - ASSERT(list_empty(&log->l_cilp->xc_cil)); > - kmem_free(log->l_cilp); > -} > +struct workqueue_struct *xfs_cil_wq; > > /* > * Allocate a new ticket. Failing to get a new ticket makes it really hard to > @@ -426,8 +376,7 @@ xlog_cil_committed( > */ > STATIC int > xlog_cil_push( > - struct log *log, > - xfs_lsn_t push_seq) > + struct log *log) > { > struct xfs_cil *cil = log->l_cilp; > struct xfs_log_vec *lv; > @@ -443,39 +392,35 @@ xlog_cil_push( > struct xfs_log_iovec lhdr; > struct xfs_log_vec lvhdr = { NULL }; > xfs_lsn_t commit_lsn; > + xfs_lsn_t push_seq; > > if (!cil) > return 0; > > - ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence); > - > new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); > new_ctx->ticket = xlog_cil_ticket_alloc(log); > > - /* > - * Lock out transaction commit, but don't block for background pushes > - * unless we are well over the CIL space limit. See the definition of > - * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic > - * used here. > - */ > - if (!down_write_trylock(&cil->xc_ctx_lock)) { > - if (!push_seq && > - cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log)) > - goto out_free_ticket; > - down_write(&cil->xc_ctx_lock); > - } > + down_write(&cil->xc_ctx_lock); > ctx = cil->xc_ctx; > > - /* check if we've anything to push */ > - if (list_empty(&cil->xc_cil)) > - goto out_skip; > + spin_lock(&cil->xc_cil_lock); > + push_seq = cil->xc_push_seq; > + ASSERT(push_seq > 0 && push_seq <= ctx->sequence); > > - /* check for spurious background flush */ > - if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) > + /* > + * Check if we've anything to push. If there is nothing, then we don't > + * move on to a new sequence number and so we have to be able to push > + * this sequence again later. > + */ > + if (list_empty(&cil->xc_cil)) { > + cil->xc_push_seq = 0; > + spin_unlock(&cil->xc_cil_lock); > goto out_skip; > + } > + spin_unlock(&cil->xc_cil_lock); > > /* check for a previously pushed seqeunce */ > - if (push_seq && push_seq < cil->xc_ctx->sequence) > + if (push_seq < cil->xc_ctx->sequence) > goto out_skip; > > /* > @@ -629,7 +574,6 @@ restart: > > out_skip: > up_write(&cil->xc_ctx_lock); > -out_free_ticket: > xfs_log_ticket_put(new_ctx->ticket); > kmem_free(new_ctx); > return 0; > @@ -641,6 +585,80 @@ out_abort: > return XFS_ERROR(EIO); > } > > +static void > +xlog_cil_push_work( > + struct work_struct *work) > +{ > + struct xfs_cil *cil = container_of(work, struct xfs_cil, > + xc_push_work); > + xlog_cil_push(cil->xc_log); > +} > + > +/* > + * We need to push CIL every so often so we don't cache more than we can fit in > + * the log. The limit really is that a checkpoint can't be more than half the > + * log (the current checkpoint is not allowed to overwrite the previous > + * checkpoint), but commit latency and memory usage limit this to a smaller > + * size. > + */ > +static void > +xlog_cil_push_background( > + struct log *log) > +{ > + struct xfs_cil *cil = log->l_cilp; > + > + /* > + * The cil won't be empty because we are called while holding the > + * context lock so whatever we added to the CIL will still be there > + */ > + ASSERT(!list_empty(&cil->xc_cil)); > + > + /* > + * don't do a background push if we haven't used up all the > + * space available yet. > + */ > + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) > + return; > + > + spin_lock(&cil->xc_cil_lock); > + cil->xc_push_seq = cil->xc_current_sequence; > + queue_work(xfs_cil_wq, &cil->xc_push_work); > + spin_unlock(&cil->xc_cil_lock); > + > +} > + > +static void > +xlog_cil_push_foreground( > + struct log *log, > + xfs_lsn_t push_seq) > +{ > + struct xfs_cil *cil = log->l_cilp; > + > + if (!cil) > + return; > + > + ASSERT(push_seq && push_seq <= cil->xc_current_sequence); > + > + /* start on any pending background push to minimise wait time on it */ > + flush_work(&cil->xc_push_work); > + > + /* > + * If the CIL is empty or we've already pushed the sequence then > + * there's no work we need to do. > + */ > + spin_lock(&cil->xc_cil_lock); > + if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { > + spin_unlock(&cil->xc_cil_lock); > + return; > + } > + > + cil->xc_push_seq = push_seq; > + spin_unlock(&cil->xc_cil_lock); > + > + /* do the push now */ > + xlog_cil_push(log); > +} > + > /* > * Commit a transaction with the given vector to the Committed Item List. > * > @@ -667,7 +685,6 @@ xfs_log_commit_cil( > { > struct log *log = mp->m_log; > int log_flags = 0; > - int push = 0; > struct xfs_log_vec *log_vector; > > if (flags & XFS_TRANS_RELEASE_LOG_RES) > @@ -719,21 +736,9 @@ xfs_log_commit_cil( > */ > xfs_trans_free_items(tp, *commit_lsn, 0); > > - /* check for background commit before unlock */ > - if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) > - push = 1; > + xlog_cil_push_background(log); > > up_read(&log->l_cilp->xc_ctx_lock); > - > - /* > - * We need to push CIL every so often so we don't cache more than we > - * can fit in the log. The limit really is that a checkpoint can't be > - * more than half the log (the current checkpoint is not allowed to > - * overwrite the previous checkpoint), but commit latency and memory > - * usage limit this to a smaller size in most cases. > - */ > - if (push) > - xlog_cil_push(log, 0); > return 0; > } > > @@ -746,9 +751,6 @@ xfs_log_commit_cil( > * > * We return the current commit lsn to allow the callers to determine if a > * iclog flush is necessary following this call. > - * > - * XXX: Initially, just push the CIL unconditionally and return whatever > - * commit lsn is there. It'll be empty, so this is broken for now. > */ > xfs_lsn_t > xlog_cil_force_lsn( > @@ -766,8 +768,7 @@ xlog_cil_force_lsn( > * xlog_cil_push() handles racing pushes for the same sequence, > * so no need to deal with it here. > */ > - if (sequence == cil->xc_current_sequence) > - xlog_cil_push(log, sequence); > + xlog_cil_push_foreground(log, sequence); > > /* > * See if we can find a previous sequence still committing. > @@ -826,3 +827,57 @@ xfs_log_item_in_current_chkpt( > return false; > return true; > } > + > +/* > + * Perform initial CIL structure initialisation. > + */ > +int > +xlog_cil_init( > + struct log *log) > +{ > + struct xfs_cil *cil; > + struct xfs_cil_ctx *ctx; > + > + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); > + if (!cil) > + return ENOMEM; > + > + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); > + if (!ctx) { > + kmem_free(cil); > + return ENOMEM; > + } > + > + INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); > + INIT_LIST_HEAD(&cil->xc_cil); > + INIT_LIST_HEAD(&cil->xc_committing); > + spin_lock_init(&cil->xc_cil_lock); > + init_rwsem(&cil->xc_ctx_lock); > + init_waitqueue_head(&cil->xc_commit_wait); > + > + INIT_LIST_HEAD(&ctx->committing); > + INIT_LIST_HEAD(&ctx->busy_extents); > + ctx->sequence = 1; > + ctx->cil = cil; > + cil->xc_ctx = ctx; > + cil->xc_current_sequence = ctx->sequence; > + > + cil->xc_log = log; > + log->l_cilp = cil; > + return 0; > +} > + > +void > +xlog_cil_destroy( > + struct log *log) > +{ > + if (log->l_cilp->xc_ctx) { > + if (log->l_cilp->xc_ctx->ticket) > + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); > + kmem_free(log->l_cilp->xc_ctx); > + } > + > + ASSERT(list_empty(&log->l_cilp->xc_cil)); > + kmem_free(log->l_cilp); > +} > + > diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h > index 2152900..ea8c076 100644 > --- a/fs/xfs/xfs_log_priv.h > +++ b/fs/xfs/xfs_log_priv.h > @@ -417,8 +417,12 @@ struct xfs_cil { > struct list_head xc_committing; > wait_queue_head_t xc_commit_wait; > xfs_lsn_t xc_current_sequence; > + struct work_struct xc_push_work; > + xfs_lsn_t xc_push_seq; > }; > > +extern struct workqueue_struct *xfs_cil_wq; > + > /* > * The amount of log space we allow the CIL to aggregate is difficult to size. > * Whatever we choose, we have to make sure we can get a reservation for the > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > index aef50ab..c5059f5 100644 > --- a/fs/xfs/xfs_super.c > +++ b/fs/xfs/xfs_super.c > @@ -1634,8 +1634,13 @@ xfs_init_workqueues(void) > if (!xfs_alloc_wq) > goto out_destroy_syncd; > > + xfs_cil_wq = alloc_workqueue("xfscwcilalloc", WQ_MEM_RECLAIM, 0); > + if (!xfs_cil_wq) > + goto out_destroy_alloc; > return 0; > > +out_destroy_alloc: > + destroy_workqueue(xfs_alloc_wq); > out_destroy_syncd: > destroy_workqueue(xfs_syncd_wq); > return -ENOMEM; > @@ -1644,6 +1649,7 @@ out_destroy_syncd: > STATIC void > xfs_destroy_workqueues(void) > { > + destroy_workqueue(xfs_cil_wq); > destroy_workqueue(xfs_alloc_wq); > destroy_workqueue(xfs_syncd_wq); > } > -- > 1.7.9 > > _______________________________________________ > xfs mailing list > xfs@xxxxxxxxxxx > http://oss.sgi.com/mailman/listinfo/xfs ---end quoted text--- _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs