On Fri, Mar 05, 2021 at 04:11:39PM +1100, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > Adding a list_sort() call to the CIL push work while the xc_ctx_lock > is held exclusively has resulted in fairly long lock hold times and > that stops all front end transaction commits from making progress. Heh, nice solution. :) > We can move the sorting out of the xc_ctx_lock if we can transfer > the ordering information to the log vectors as they are detached > from the log items and then we can sort the log vectors. This > requires log vectors to use a list_head rather than a single linked > list Ergh, could pull out the list conversion into a separate piece? Some of the lv_chain usage is ... not entirely textbook. > and to hold an order ID field. With these changes, we can move > the list_sort() call to just before we call xlog_write() when we > aren't holding any locks at all. > > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- > fs/xfs/xfs_log.c | 46 +++++++++++++++++++++--------- > fs/xfs/xfs_log.h | 3 +- > fs/xfs/xfs_log_cil.c | 63 +++++++++++++++++++++++++---------------- > fs/xfs/xfs_log_priv.h | 4 +-- > fs/xfs/xfs_trans.c | 4 +-- > fs/xfs/xfs_trans_priv.h | 4 +-- > 6 files changed, 78 insertions(+), 46 deletions(-) > > diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c > index 46a006d41184..fd58c3213ebf 100644 > --- a/fs/xfs/xfs_log.c > +++ b/fs/xfs/xfs_log.c > @@ -846,6 +846,9 @@ xlog_write_unmount_record( > .lv_niovecs = 1, > .lv_iovecp = ®, > }; > + LIST_HEAD(lv_chain); > + INIT_LIST_HEAD(&vec.lv_chain); > + list_add(&vec.lv_chain, &lv_chain); > > /* account for space used by record data */ > ticket->t_curr_res -= sizeof(unmount_rec); > @@ -857,8 +860,8 @@ xlog_write_unmount_record( > */ > if (log->l_targ != log->l_mp->m_ddev_targp) > blkdev_issue_flush(log->l_targ->bt_bdev); > - return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS, > - reg.i_len); > + return xlog_write(log, &lv_chain, ticket, NULL, NULL, > + XLOG_UNMOUNT_TRANS, reg.i_len); > } > > /* > @@ -1571,14 +1574,17 @@ xlog_commit_record( > .lv_iovecp = ®, > }; > int error; > + LIST_HEAD(lv_chain); > + INIT_LIST_HEAD(&vec.lv_chain); > + list_add(&vec.lv_chain, &lv_chain); > > if (XLOG_FORCED_SHUTDOWN(log)) > return -EIO; > > /* account for space used by record data */ > ticket->t_curr_res -= reg.i_len; > - error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS, > - reg.i_len); > + error = xlog_write(log, &lv_chain, ticket, lsn, iclog, > + XLOG_COMMIT_TRANS, reg.i_len); > if (error) > xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); > return error; > @@ -2109,6 +2115,7 @@ xlog_print_trans( > */ > static struct xfs_log_vec * > xlog_write_single( > + struct list_head *lv_chain, > struct xfs_log_vec *log_vector, > struct xlog_ticket *ticket, > struct xlog_in_core *iclog, > @@ -2117,7 +2124,7 @@ xlog_write_single( > uint32_t *record_cnt, > uint32_t *data_cnt) > { > - struct xfs_log_vec *lv = log_vector; > + struct xfs_log_vec *lv; > void *ptr; > int index; > > @@ -2125,10 +2132,13 @@ xlog_write_single( > iclog->ic_state == XLOG_STATE_WANT_SYNC); > > ptr = iclog->ic_datap + *log_offset; > - for (lv = log_vector; lv; lv = lv->lv_next) { > + for (lv = log_vector; > + !list_entry_is_head(lv, lv_chain, lv_chain); > + lv = list_next_entry(lv, lv_chain)) { > /* > - * If the entire log vec does not fit in the iclog, punt it to > - * the partial copy loop which can handle this case. > + * If the log vec contains data that needs to be copied and does > + * not entirely fit in the iclog, punt it to the partial copy > + * loop which can handle this case. > */ > if (lv->lv_niovecs && > lv->lv_bytes > iclog->ic_size - *log_offset) > @@ -2154,6 +2164,8 @@ xlog_write_single( > *data_cnt += reg->i_len; > } > } > + if (list_entry_is_head(lv, lv_chain, lv_chain)) > + lv = NULL; > ASSERT(*len == 0 || lv); > return lv; > } > @@ -2199,6 +2211,7 @@ xlog_write_get_more_iclog_space( > static struct xfs_log_vec * > xlog_write_partial( > struct xlog *log, > + struct list_head *lv_chain, > struct xfs_log_vec *log_vector, > struct xlog_ticket *ticket, > struct xlog_in_core **iclogp, > @@ -2338,7 +2351,10 @@ xlog_write_partial( > * the caller so it can go back to fast path copying. > */ > *iclogp = iclog; > - return lv->lv_next; > + lv = list_next_entry(lv, lv_chain); > + if (list_entry_is_head(lv, lv_chain, lv_chain)) > + return NULL; > + return lv; > } > > /* > @@ -2384,7 +2400,7 @@ xlog_write_partial( > int > xlog_write( > struct xlog *log, > - struct xfs_log_vec *log_vector, > + struct list_head *lv_chain, > struct xlog_ticket *ticket, > xfs_lsn_t *start_lsn, > struct xlog_in_core **commit_iclog, > @@ -2392,7 +2408,7 @@ xlog_write( > uint32_t len) > { > struct xlog_in_core *iclog = NULL; > - struct xfs_log_vec *lv = log_vector; > + struct xfs_log_vec *lv; > int record_cnt = 0; > int data_cnt = 0; > int error = 0; > @@ -2424,15 +2440,17 @@ xlog_write( > if (optype & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) > iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); > > + lv = list_first_entry_or_null(lv_chain, struct xfs_log_vec, lv_chain); > while (lv) { > - lv = xlog_write_single(lv, ticket, iclog, &log_offset, > + lv = xlog_write_single(lv_chain, lv, ticket, iclog, &log_offset, > &len, &record_cnt, &data_cnt); > if (!lv) > break; > > ASSERT(!(optype & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))); > - lv = xlog_write_partial(log, lv, ticket, &iclog, &log_offset, > - &len, &record_cnt, &data_cnt); > + lv = xlog_write_partial(log, lv_chain, lv, ticket, &iclog, > + &log_offset, &len, &record_cnt, > + &data_cnt); > if (IS_ERR_OR_NULL(lv)) { > error = PTR_ERR_OR_ZERO(lv); > break; > diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h > index af54ea3f8c90..0445dd6acbce 100644 > --- a/fs/xfs/xfs_log.h > +++ b/fs/xfs/xfs_log.h > @@ -9,7 +9,8 @@ > struct xfs_cil_ctx; > > struct xfs_log_vec { > - struct xfs_log_vec *lv_next; /* next lv in build list */ > + struct list_head lv_chain; /* lv chain ptrs */ > + int lv_order_id; /* chain ordering info */ uint32_t to match li_order_id? > int lv_niovecs; /* number of iovecs in lv */ > struct xfs_log_iovec *lv_iovecp; /* iovec array */ > struct xfs_log_item *lv_item; /* owner */ > diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c > index 3d43a5088154..6dcc23829bef 100644 > --- a/fs/xfs/xfs_log_cil.c > +++ b/fs/xfs/xfs_log_cil.c > @@ -72,6 +72,7 @@ xlog_cil_ctx_alloc(void) > ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); > INIT_LIST_HEAD(&ctx->committing); > INIT_LIST_HEAD(&ctx->busy_extents); > + INIT_LIST_HEAD(&ctx->lv_chain); > INIT_WORK(&ctx->push_work, xlog_cil_push_work); > return ctx; > } > @@ -237,6 +238,7 @@ xlog_cil_alloc_shadow_bufs( > lv = kmem_alloc_large(buf_size, KM_NOFS); > memset(lv, 0, xlog_cil_iovec_space(niovecs)); > > + INIT_LIST_HEAD(&lv->lv_chain); > lv->lv_item = lip; > lv->lv_size = buf_size; > if (ordered) > @@ -252,7 +254,6 @@ xlog_cil_alloc_shadow_bufs( > else > lv->lv_buf_len = 0; > lv->lv_bytes = 0; > - lv->lv_next = NULL; > } > > /* Ensure the lv is set up according to ->iop_size */ > @@ -379,8 +380,6 @@ xlog_cil_insert_format_items( > if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) { > /* same or smaller, optimise common overwrite case */ > lv = lip->li_lv; > - lv->lv_next = NULL; What /did/ these null assignments do? > - > if (ordered) > goto insert; > > @@ -547,14 +546,14 @@ xlog_cil_insert_items( > > static void > xlog_cil_free_logvec( > - struct xfs_log_vec *log_vector) > + struct list_head *lv_chain) > { > struct xfs_log_vec *lv; > > - for (lv = log_vector; lv; ) { > - struct xfs_log_vec *next = lv->lv_next; > + while(!list_empty(lv_chain)) { Nit: space after "while". > + lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_chain); > + list_del_init(&lv->lv_chain); > kmem_free(lv); > - lv = next; > } > } > > @@ -653,7 +652,7 @@ xlog_cil_committed( > spin_unlock(&ctx->cil->xc_push_lock); > } > > - xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, > + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain, > ctx->start_lsn, abort); > > xfs_extent_busy_sort(&ctx->busy_extents); > @@ -664,7 +663,7 @@ xlog_cil_committed( > list_del(&ctx->committing); > spin_unlock(&ctx->cil->xc_push_lock); > > - xlog_cil_free_logvec(ctx->lv_chain); > + xlog_cil_free_logvec(&ctx->lv_chain); > > if (!list_empty(&ctx->busy_extents)) > xlog_discard_busy_extents(mp, ctx); > @@ -744,7 +743,7 @@ xlog_cil_build_trans_hdr( > lvhdr->lv_niovecs = 2; > lvhdr->lv_iovecp = &hdr->lhdr[0]; > lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len; > - lvhdr->lv_next = ctx->lv_chain; > + list_add(&lvhdr->lv_chain, &ctx->lv_chain); > > tic->t_curr_res -= lvhdr->lv_bytes; > } > @@ -755,12 +754,14 @@ xlog_cil_order_cmp( > struct list_head *a, > struct list_head *b) > { > - struct xfs_log_item *l1 = container_of(a, struct xfs_log_item, li_cil); > - struct xfs_log_item *l2 = container_of(b, struct xfs_log_item, li_cil); > + struct xfs_log_vec *l1 = container_of(a, struct xfs_log_vec, > + lv_chain); > + struct xfs_log_vec *l2 = container_of(b, struct xfs_log_vec, > + lv_chain); > > - if (l1->li_order_id > l2->li_order_id) > + if (l1->lv_order_id > l2->lv_order_id) > return 1; > - if (l1->li_order_id < l2->li_order_id) > + if (l1->lv_order_id < l2->lv_order_id) > return -1; > return 0; > } > @@ -907,26 +908,25 @@ xlog_cil_push_work( > * needed on the transaction commit side which is currently locked out > * by the flush lock. > */ > - list_sort(NULL, &log_items, xlog_cil_order_cmp); > lv = NULL; > while (!list_empty(&log_items)) { > struct xfs_log_item *item; > > item = list_first_entry(&log_items, > struct xfs_log_item, li_cil); > - list_del_init(&item->li_cil); > - item->li_order_id = 0; > - if (!ctx->lv_chain) > - ctx->lv_chain = item->li_lv; > - else > - lv->lv_next = item->li_lv; > + > lv = item->li_lv; > - item->li_lv = NULL; > + lv->lv_order_id = item->li_order_id; > num_iovecs += lv->lv_niovecs; > - > /* we don't write ordered log vectors */ > if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) > num_bytes += lv->lv_bytes; > + list_add_tail(&lv->lv_chain, &ctx->lv_chain); > + > + list_del_init(&item->li_cil); Do the list manipulations need moving, or could they have stayed further up in the loop body for a cleaner patch? > + item->li_order_id = 0; > + item->li_lv = NULL; > + > } > > /* > @@ -959,6 +959,13 @@ xlog_cil_push_work( > spin_unlock(&cil->xc_push_lock); > up_write(&cil->xc_ctx_lock); > > + /* > + * Sort the log vector chain before we add the transaction headers. > + * This ensures we always have the transaction headers at the start > + * of the chain. > + */ > + list_sort(NULL, &ctx->lv_chain, xlog_cil_order_cmp); > + > /* > * Build a checkpoint transaction header and write it to the log to > * begin the transaction. We need to account for the space used by the > @@ -981,8 +988,14 @@ xlog_cil_push_work( > * use the commit record lsn then we can move the tail beyond the grant > * write head. > */ > - error = xlog_write(log, &lvhdr, ctx->ticket, &ctx->start_lsn, NULL, > - XLOG_START_TRANS, num_bytes); > + error = xlog_write(log, &ctx->lv_chain, ctx->ticket, &ctx->start_lsn, > + NULL, XLOG_START_TRANS, num_bytes); > + > + /* > + * Take the lvhdr back off the lv_chain as it should not be passed > + * to log IO completion. > + */ > + list_del(&lvhdr.lv_chain); > if (error) > goto out_abort_free_ticket; > > diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h > index 12a1a36eef7e..6a4160200417 100644 > --- a/fs/xfs/xfs_log_priv.h > +++ b/fs/xfs/xfs_log_priv.h > @@ -224,7 +224,7 @@ struct xfs_cil_ctx { > int nvecs; /* number of regions */ > atomic_t space_used; /* aggregate size of regions */ > struct list_head busy_extents; /* busy extents in chkpt */ > - struct xfs_log_vec *lv_chain; /* logvecs being pushed */ > + struct list_head lv_chain; /* logvecs being pushed */ > struct list_head iclog_entry; > struct list_head committing; /* ctx committing list */ > struct work_struct discard_endio_work; > @@ -480,7 +480,7 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) > > void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); > void xlog_print_trans(struct xfs_trans *); > -int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, > +int xlog_write(struct xlog *log, struct list_head *lv_chain, > struct xlog_ticket *tic, xfs_lsn_t *start_lsn, > struct xlog_in_core **commit_iclog, uint optype, uint32_t len); > int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, > diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c > index 83c2b7f22eb7..b20e68279808 100644 > --- a/fs/xfs/xfs_trans.c > +++ b/fs/xfs/xfs_trans.c > @@ -747,7 +747,7 @@ xfs_log_item_batch_insert( > void > xfs_trans_committed_bulk( > struct xfs_ail *ailp, > - struct xfs_log_vec *log_vector, > + struct list_head *lv_chain, > xfs_lsn_t commit_lsn, > bool aborted) > { > @@ -762,7 +762,7 @@ xfs_trans_committed_bulk( > spin_unlock(&ailp->ail_lock); > > /* unpin all the log items */ > - for (lv = log_vector; lv; lv = lv->lv_next ) { > + list_for_each_entry(lv, lv_chain, lv_chain) { > struct xfs_log_item *lip = lv->lv_item; > xfs_lsn_t item_lsn; > > diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h > index 3004aeac9110..b0bf78e6ff76 100644 > --- a/fs/xfs/xfs_trans_priv.h > +++ b/fs/xfs/xfs_trans_priv.h > @@ -18,8 +18,8 @@ void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); > void xfs_trans_del_item(struct xfs_log_item *); > void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); > > -void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, > - xfs_lsn_t commit_lsn, bool aborted); > +void xfs_trans_committed_bulk(struct xfs_ail *ailp, > + struct list_head *lv_chain, xfs_lsn_t commit_lsn, bool aborted); > /* > * AIL traversal cursor. > * > -- > 2.28.0 >