From: Dave Chinner <dchinner@xxxxxxxxxx> The first step to replacing the dabuf infrastructure is to introduce a buffer construct that can support multiple block ranges in a single mapping. Directory buffers can be made up of multiple extents, but are currently formed by creating individual buffers and then copying the data out of them into a dabuf structure. All dabuf operations then require walking all the underlying buffers to change the state of the underlying buffers, and once a dabuf is modified the contents need to be copied back to the underlying buffers before they are logged. All of these operations can be done on a normal xfs_buf, but the normal xfs_buf does not support multiple disk block ranges or doing multiple disjoint I/Os to read or write a buffer. Supporting multiple disk block ranges is not difficult - we simply need to attach an iovec-like array to the buffer rather than just using a single block number and length. Splitting the buffer up into multiple IOs for read and write is not difficult, either. We already track the number of IOs remaining to complete an IO, so this can be used to wait for the multiple IOs dispatched to complete (for both read and write). The only interesting twist to this is logging the changes. We can treat the compound buffer as a single buffer for most purposes except for formatting the changes into the log. When formatting, we need to split the changes into a format item per underlying region so that recovery does not need to know about compound buffers and can recover each segment of a directory block individually as it does now. The fact that recovery will replay all or none of the transaction ensures this process is still atomic from a change recovery point of view. This new sort of buffer will be known as a "compound buffer", and will be tagged with a flag to indicate it is such. Compound buffers will be indexed and cached by the block number of their initial segment and the length of the entire buffer. 
Introduce the compound buffer flags and the block vector infrastructure needed to replace the existing block number indexing. Further patches will introduce the real compound buffer functionality. Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> --- fs/xfs/xfs_buf.c | 19 +++++++++++++------ fs/xfs/xfs_buf.h | 17 ++++++++++++++--- fs/xfs/xfs_trace.h | 8 ++++---- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 44cf63c..152e855 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -201,7 +201,12 @@ xfs_buf_alloc( bp->b_flags = flags; /* XXX: we have the block number. Why don't we just set it here? */ - bp->b_bn = XFS_BUF_DADDR_NULL; + /* initialise the buffer IO vector array appropriately */ + bp->b_vec_count = 1; + bp->b_vec = &bp->b_vec_array[0]; + bp->b_vec[0].bv_bn = XFS_BUF_DADDR_NULL; + bp->b_vec[0].bv_len = bp->b_buffer_length; + atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); @@ -564,7 +569,8 @@ xfs_buf_get( * Now we have a workable buffer, fill in the block number so * that we can do IO on it. 
*/ - bp->b_bn = blkno; + bp->b_vec[0].bv_bn = blkno; + bp->b_vec[0].bv_len = bp->b_buffer_length; bp->b_count_desired = bp->b_buffer_length; found: @@ -596,7 +602,7 @@ _xfs_buf_read( int status; ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); - ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); + ASSERT(bp->b_vec[0].bv_bn != XFS_BUF_DADDR_NULL); bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); @@ -711,7 +717,8 @@ xfs_buf_set_empty( bp->b_addr = NULL; bp->b_file_offset = 0; bp->b_buffer_length = bp->b_count_desired = numblks << BBSHIFT; - bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_vec[0].bv_bn = XFS_BUF_DADDR_NULL; + bp->b_vec[0].bv_len = bp->b_buffer_length; bp->b_flags &= ~XBF_MAPPED; } @@ -1185,7 +1192,7 @@ _xfs_buf_ioapply( struct bio *bio; int offset = bp->b_offset; int size = bp->b_count_desired; - sector_t sector = bp->b_bn; + sector_t sector = bp->b_vec[0].bv_bn; total_nr_pages = bp->b_page_count; map_i = 0; @@ -1678,7 +1685,7 @@ xfs_buf_cmp( struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); xfs_daddr_t diff; - diff = ap->b_bn - bp->b_bn; + diff = ap->b_vec[0].bv_bn - bp->b_vec[0].bv_bn; if (diff < 0) return -1; if (diff > 0) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index a1c078d..1a3367e 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -67,6 +67,7 @@ typedef enum { #define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on delwri queue */ +#define _XBF_COMPOUND (1 << 23)/* compound buffer */ typedef unsigned int xfs_buf_flags_t; @@ -121,6 +122,12 @@ typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); #define XB_PAGES 2 +struct xfs_buf_vec { + xfs_daddr_t bv_bn; /* block number for I/O */ + size_t bv_len; /* size of I/O */ +}; +#define XB_VECS 2 + typedef struct xfs_buf { /* * first cacheline holds all the fields needed for an uncontended cache @@ -142,7 +149,6 @@ 
typedef struct xfs_buf { struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ xfs_buftarg_t *b_target; /* buffer target (device) */ - xfs_daddr_t b_bn; /* block number for I/O */ size_t b_count_desired;/* desired transfer size */ void *b_addr; /* virtual address of buffer */ struct work_struct b_iodone_work; @@ -158,6 +164,11 @@ typedef struct xfs_buf { unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ unsigned short b_error; /* error code on I/O */ + + struct xfs_buf_vec *b_vec; /* compound buffer vector */ + struct xfs_buf_vec b_vec_array[XB_VECS]; /* inline vectors */ + int b_vec_count; /* size of vector array */ + #ifdef XFS_BUF_LOCK_TRACKING int b_last_holder; #endif @@ -260,8 +271,8 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) -#define XFS_BUF_ADDR(bp) ((bp)->b_bn) -#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) +#define XFS_BUF_ADDR(bp) ((bp)->b_vec[0].bv_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_vec[0].bv_bn = (xfs_daddr_t)(bno)) #define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) #define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off)) #define XFS_BUF_COUNT(bp) ((bp)->b_count_desired) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f1d2802..40d1f9c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -290,7 +290,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; - __entry->bno = bp->b_bn; + __entry->bno = bp->b_vec[0].bv_bn; __entry->buffer_length = bp->b_buffer_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); @@ -361,7 +361,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; - __entry->bno = bp->b_bn; + __entry->bno = bp->b_vec[0].bv_bn; __entry->buffer_length = 
bp->b_buffer_length; __entry->flags = flags; __entry->hold = atomic_read(&bp->b_hold); @@ -405,7 +405,7 @@ TRACE_EVENT(xfs_buf_ioerror, ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; - __entry->bno = bp->b_bn; + __entry->bno = bp->b_vec[0].bv_bn; __entry->buffer_length = bp->b_buffer_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); @@ -449,7 +449,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->bli_flags = bip->bli_flags; __entry->bli_recur = bip->bli_recur; __entry->bli_refcount = atomic_read(&bip->bli_refcount); - __entry->buf_bno = bip->bli_buf->b_bn; + __entry->buf_bno = bip->bli_buf->b_vec[0].bv_bn; __entry->buf_len = bip->bli_buf->b_buffer_length; __entry->buf_flags = bip->bli_buf->b_flags; __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); -- 1.7.5.4 _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs