On Wed, Mar 16, 2016 at 12:44:41PM +0100, Christoph Hellwig wrote: > This patch implements two closely related changes: First it embedds a > bio the ioend structure so that we don't have to allocate one separately. > Second it uses the block layer bio chaining mechanism to chain additional > bios off this first one if needed instead of manually accouting for > multiple bio completions in the ioend structure. Together this removes a > memory allocation per ioend and greatly simplifies the ioend setup and > I/O completion path. > > Signed-off-by: Christoph Hellwig <hch@xxxxxx> > --- Thanks for the helper cleanups and the bi_error fix. Looks good to me: Reviewed-by: Brian Foster <bfoster@xxxxxxxxxx> > fs/xfs/xfs_aops.c | 247 ++++++++++++++++++++++++----------------------------- > fs/xfs/xfs_aops.h | 15 ++-- > fs/xfs/xfs_super.c | 26 ++---- > 3 files changed, 123 insertions(+), 165 deletions(-) > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > index 5928770..42e7368 100644 > --- a/fs/xfs/xfs_aops.c > +++ b/fs/xfs/xfs_aops.c > @@ -124,18 +124,25 @@ next_bh: > */ > STATIC void > xfs_destroy_ioend( > - struct xfs_ioend *ioend) > + struct xfs_ioend *ioend, > + int error) > { > struct inode *inode = ioend->io_inode; > - int error = ioend->io_error; > + struct bio *last = ioend->io_bio; > struct bio *bio, *next; > > - for (bio = ioend->io_bio_done; bio; bio = next) { > + for (bio = &ioend->io_inline_bio; bio; bio = next) { > struct bio_vec *bvec; > int i; > > - next = bio->bi_private; > - bio->bi_private = NULL; > + /* > + * For the last bio, bi_private points to the ioend, so we > + * need to explicitly end the iteration here. > + */ > + if (bio == last) > + next = NULL; > + else > + next = bio->bi_private; > > /* walk each page on bio, ending page IO on them */ > bio_for_each_segment_all(bvec, bio, i) > @@ -143,8 +150,6 @@ xfs_destroy_ioend( > > bio_put(bio); > } > - > - mempool_free(ioend, xfs_ioend_pool); > } > > /* > @@ -218,7 +223,8 @@ xfs_setfilesize( > > STATIC int > xfs_setfilesize_ioend( > - struct xfs_ioend *ioend) > + struct xfs_ioend *ioend, > + int error) > { > struct xfs_inode *ip = XFS_I(ioend->io_inode); > struct xfs_trans *tp = ioend->io_append_trans; > @@ -232,53 +238,32 @@ xfs_setfilesize_ioend( > __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); > > /* we abort the update if there was an IO error */ > - if (ioend->io_error) { > + if (error) { > xfs_trans_cancel(tp); > - return ioend->io_error; > + return error; > } > > return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); > } > > /* > - * Schedule IO completion handling on the final put of an ioend. > - * > - * If there is no work to do we might as well call it a day and free the > - * ioend right now. > - */ > -STATIC void > -xfs_finish_ioend( > - struct xfs_ioend *ioend) > -{ > - if (atomic_dec_and_test(&ioend->io_remaining)) { > - struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; > - > - if (ioend->io_type == XFS_IO_UNWRITTEN) > - queue_work(mp->m_unwritten_workqueue, &ioend->io_work); > - else if (ioend->io_append_trans) > - queue_work(mp->m_data_workqueue, &ioend->io_work); > - else > - xfs_destroy_ioend(ioend); > - } > -} > - > -/* > * IO write completion. > */ > STATIC void > xfs_end_io( > struct work_struct *work) > { > - xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); > - struct xfs_inode *ip = XFS_I(ioend->io_inode); > - int error = 0; > + struct xfs_ioend *ioend = > + container_of(work, struct xfs_ioend, io_work); > + struct xfs_inode *ip = XFS_I(ioend->io_inode); > + int error = ioend->io_bio->bi_error; > > /* > * Set an error if the mount has shut down and proceed with end I/O > * processing so it can perform whatever cleanups are necessary. > */ > if (XFS_FORCED_SHUTDOWN(ip->i_mount)) > - ioend->io_error = -EIO; > + error = -EIO; > > /* > * For unwritten extents we need to issue transactions to convert a > @@ -288,50 +273,33 @@ xfs_end_io( > * on error. > */ > if (ioend->io_type == XFS_IO_UNWRITTEN) { > - if (ioend->io_error) > + if (error) > goto done; > error = xfs_iomap_write_unwritten(ip, ioend->io_offset, > ioend->io_size); > } else if (ioend->io_append_trans) { > - error = xfs_setfilesize_ioend(ioend); > + error = xfs_setfilesize_ioend(ioend, error); > } else { > ASSERT(!xfs_ioend_is_append(ioend)); > } > > done: > - if (error) > - ioend->io_error = error; > - xfs_destroy_ioend(ioend); > + xfs_destroy_ioend(ioend, error); > } > > -/* > - * Allocate and initialise an IO completion structure. > - * We need to track unwritten extent write completion here initially. > - * We'll need to extend this for updating the ondisk inode size later > - * (vs. incore size). > - */ > -STATIC xfs_ioend_t * > -xfs_alloc_ioend( > - struct inode *inode, > - unsigned int type) > +STATIC void > +xfs_end_bio( > + struct bio *bio) > { > - xfs_ioend_t *ioend; > - > - ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); > - memset(ioend, 0, sizeof(*ioend)); > + struct xfs_ioend *ioend = bio->bi_private; > + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; > > - /* > - * Set the count to 1 initially, which will prevent an I/O > - * completion callback from happening before we have started > - * all the I/O from calling the completion routine too early. > - */ > - atomic_set(&ioend->io_remaining, 1); > - INIT_LIST_HEAD(&ioend->io_list); > - ioend->io_type = type; > - ioend->io_inode = inode; > - INIT_WORK(&ioend->io_work, xfs_end_io); > - spin_lock_init(&ioend->io_lock); > - return ioend; > + if (ioend->io_type == XFS_IO_UNWRITTEN) > + queue_work(mp->m_unwritten_workqueue, &ioend->io_work); > + else if (ioend->io_append_trans) > + queue_work(mp->m_data_workqueue, &ioend->io_work); > + else > + xfs_destroy_ioend(ioend, bio->bi_error); > } > > STATIC int > @@ -403,56 +371,6 @@ xfs_imap_valid( > offset < imap->br_startoff + imap->br_blockcount; > } > > -/* > - * BIO completion handler for buffered IO. > - */ > -STATIC void > -xfs_end_bio( > - struct bio *bio) > -{ > - struct xfs_ioend *ioend = bio->bi_private; > - unsigned long flags; > - > - bio->bi_private = NULL; > - bio->bi_end_io = NULL; > - > - spin_lock_irqsave(&ioend->io_lock, flags); > - if (!ioend->io_error) > - ioend->io_error = bio->bi_error; > - if (!ioend->io_bio_done) > - ioend->io_bio_done = bio; > - else > - ioend->io_bio_done_tail->bi_private = bio; > - ioend->io_bio_done_tail = bio; > - spin_unlock_irqrestore(&ioend->io_lock, flags); > - > - xfs_finish_ioend(ioend); > -} > - > -STATIC void > -xfs_submit_ioend_bio( > - struct writeback_control *wbc, > - xfs_ioend_t *ioend, > - struct bio *bio) > -{ > - atomic_inc(&ioend->io_remaining); > - bio->bi_private = ioend; > - bio->bi_end_io = xfs_end_bio; > - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); > -} > - > -STATIC struct bio * > -xfs_alloc_ioend_bio( > - struct buffer_head *bh) > -{ > - struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); > - > - ASSERT(bio->bi_private == NULL); > - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); > - bio->bi_bdev = bh->b_bdev; > - return bio; > -} > - > STATIC void > xfs_start_buffer_writeback( > struct buffer_head *bh) > @@ -513,16 +431,19 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) > STATIC int > xfs_submit_ioend( > struct writeback_control *wbc, > - xfs_ioend_t *ioend, > + struct xfs_ioend *ioend, > int status) > { > /* Reserve log space if we might write beyond the on-disk inode size. */ > if (!status && > - ioend->io_bio && ioend->io_type != XFS_IO_UNWRITTEN && > + ioend->io_type != XFS_IO_UNWRITTEN && > xfs_ioend_is_append(ioend) && > !ioend->io_append_trans) > status = xfs_setfilesize_trans_alloc(ioend); > > + ioend->io_bio->bi_private = ioend; > + ioend->io_bio->bi_end_io = xfs_end_bio; > + > /* > * If we are failing the IO now, just mark the ioend with an > * error and finish it. This will run IO completion immediately > @@ -530,19 +451,75 @@ xfs_submit_ioend( > * time. > */ > if (status) { > - if (ioend->io_bio) > - bio_put(ioend->io_bio); > - ioend->io_error = status; > - xfs_finish_ioend(ioend); > + ioend->io_bio->bi_error = status; > + bio_endio(ioend->io_bio); > return status; > } > > - xfs_submit_ioend_bio(wbc, ioend, ioend->io_bio); > - ioend->io_bio = NULL; > - xfs_finish_ioend(ioend); > + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, > + ioend->io_bio); > return 0; > } > > +static void > +xfs_init_bio_from_bh( > + struct bio *bio, > + struct buffer_head *bh) > +{ > + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); > + bio->bi_bdev = bh->b_bdev; > +} > + > +static struct xfs_ioend * > +xfs_alloc_ioend( > + struct inode *inode, > + unsigned int type, > + xfs_off_t offset, > + struct buffer_head *bh) > +{ > + struct xfs_ioend *ioend; > + struct bio *bio; > + > + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset); > + xfs_init_bio_from_bh(bio, bh); > + > + ioend = container_of(bio, struct xfs_ioend, io_inline_bio); > + INIT_LIST_HEAD(&ioend->io_list); > + ioend->io_type = type; > + ioend->io_inode = inode; > + ioend->io_size = 0; > + ioend->io_offset = offset; > + INIT_WORK(&ioend->io_work, xfs_end_io); > + ioend->io_append_trans = NULL; > + ioend->io_bio = bio; > + return ioend; > +} > + > +/* > + * Allocate a new bio, and chain the old bio to the new one. > + * > + * Note that we have to do perform the chaining in this unintuitive order > + * so that the bi_private linkage is set up in the right direction for the > + * traversal in xfs_destroy_ioend(). > + */ > +static void > +xfs_chain_bio( > + struct xfs_ioend *ioend, > + struct writeback_control *wbc, > + struct buffer_head *bh) > +{ > + struct bio *new; > + > + new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); > + xfs_init_bio_from_bh(new, bh); > + > + bio_chain(ioend->io_bio, new); > + bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ > + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, > + ioend->io_bio); > + ioend->io_bio = new; > +} > + > /* > * Test to see if we've been building up a completion structure for > * earlier buffers -- if so, we try to append to this ioend if we > @@ -564,19 +541,15 @@ xfs_add_to_ioend( > offset != wpc->ioend->io_offset + wpc->ioend->io_size) { > if (wpc->ioend) > list_add(&wpc->ioend->io_list, iolist); > - wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type); > - wpc->ioend->io_offset = offset; > + wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh); > } > > -retry: > - if (!wpc->ioend->io_bio) > - wpc->ioend->io_bio = xfs_alloc_ioend_bio(bh); > - > - if (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size) { > - xfs_submit_ioend_bio(wbc, wpc->ioend, wpc->ioend->io_bio); > - wpc->ioend->io_bio = NULL; > - goto retry; > - } > + /* > + * If the buffer doesn't fit into the bio we need to allocate a new > + * one. This shouldn't happen more than once for a given buffer. > + */ > + while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size) > + xfs_chain_bio(wpc->ioend, wbc, bh); > > wpc->ioend->io_size += bh->b_size; > wpc->last_block = bh->b_blocknr; > diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h > index 1c7b041..8b5b641 100644 > --- a/fs/xfs/xfs_aops.h > +++ b/fs/xfs/xfs_aops.h > @@ -18,7 +18,7 @@ > #ifndef __XFS_AOPS_H__ > #define __XFS_AOPS_H__ > > -extern mempool_t *xfs_ioend_pool; > +extern struct bio_set *xfs_ioend_bioset; > > /* > * Types of I/O for bmap clustering and I/O completion tracking. > @@ -37,24 +37,19 @@ enum { > { XFS_IO_OVERWRITE, "overwrite" } > > /* > - * xfs_ioend struct manages large extent writes for XFS. > - * It can manage several multi-page bio's at once. > + * Structure for buffered I/O completions. > */ > -typedef struct xfs_ioend { > +struct xfs_ioend { > struct list_head io_list; /* next ioend in chain */ > unsigned int io_type; /* delalloc / unwritten */ > - int io_error; /* I/O error code */ > - atomic_t io_remaining; /* hold count */ > struct inode *io_inode; /* file being written to */ > size_t io_size; /* size of the extent */ > xfs_off_t io_offset; /* offset in the file */ > struct work_struct io_work; /* xfsdatad work queue */ > struct xfs_trans *io_append_trans;/* xact. for size update */ > struct bio *io_bio; /* bio being built */ > - struct bio *io_bio_done; /* bios completed */ > - struct bio *io_bio_done_tail; /* bios completed */ > - spinlock_t io_lock; /* for bio completion list */ > -} xfs_ioend_t; > + struct bio io_inline_bio; /* MUST BE LAST! */ > +}; > > extern const struct address_space_operations xfs_address_space_operations; > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > index d760934..e52e9c1 100644 > --- a/fs/xfs/xfs_super.c > +++ b/fs/xfs/xfs_super.c > @@ -58,8 +58,7 @@ > #include <linux/parser.h> > > static const struct super_operations xfs_super_operations; > -static kmem_zone_t *xfs_ioend_zone; > -mempool_t *xfs_ioend_pool; > +struct bio_set *xfs_ioend_bioset; > > static struct kset *xfs_kset; /* top-level xfs sysfs dir */ > #ifdef DEBUG > @@ -1688,20 +1687,15 @@ MODULE_ALIAS_FS("xfs"); > STATIC int __init > xfs_init_zones(void) > { > - > - xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); > - if (!xfs_ioend_zone) > + xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE, > + offsetof(struct xfs_ioend, io_inline_bio)); > + if (!xfs_ioend_bioset) > goto out; > > - xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, > - xfs_ioend_zone); > - if (!xfs_ioend_pool) > - goto out_destroy_ioend_zone; > - > xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), > "xfs_log_ticket"); > if (!xfs_log_ticket_zone) > - goto out_destroy_ioend_pool; > + goto out_free_ioend_bioset; > > xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), > "xfs_bmap_free_item"); > @@ -1797,10 +1791,8 @@ xfs_init_zones(void) > kmem_zone_destroy(xfs_bmap_free_item_zone); > out_destroy_log_ticket_zone: > kmem_zone_destroy(xfs_log_ticket_zone); > - out_destroy_ioend_pool: > - mempool_destroy(xfs_ioend_pool); > - out_destroy_ioend_zone: > - kmem_zone_destroy(xfs_ioend_zone); > + out_free_ioend_bioset: > + bioset_free(xfs_ioend_bioset); > out: > return -ENOMEM; > } > @@ -1826,9 +1818,7 @@ xfs_destroy_zones(void) > kmem_zone_destroy(xfs_btree_cur_zone); > kmem_zone_destroy(xfs_bmap_free_item_zone); > kmem_zone_destroy(xfs_log_ticket_zone); > - mempool_destroy(xfs_ioend_pool); > - kmem_zone_destroy(xfs_ioend_zone); > - > + bioset_free(xfs_ioend_bioset); > } > > STATIC int __init > -- > 2.1.4 > > _______________________________________________ > xfs mailing list > xfs@xxxxxxxxxxx > http://oss.sgi.com/mailman/listinfo/xfs _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs