Re: [PATCH 16/18] xfs: use a b+tree for the in-core extent list

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Oct 31, 2017 at 04:22:28PM +0200, Christoph Hellwig wrote:
> Replace the current linear list and the indirection array for the in-core
> extent list with a b+tree to avoid the need for larger memory allocations
> for the indirection array when lots of extents are present.  The current
> extent list implementations leads to heavy pressure on the memory
> allocator when modifying files with a high extent count, and can lead
> to high latencies because of that.
> 
> The replacement is a b+tree with a few quirks.  The leaf nodes directly
> store the extent record in two u64 values.  The encoding is a little bit
> different from the existing in-core extent records so that the start
> offset and length which are required for lookups can be retreived with
> simple mask operations.  The inner nodes store a 64-bit key containing
> the start offset in the first half of the node, and the pointers to the
> next lower level in the second half.  In either case we walk the node
> from the beginninig to the end and do a linear search, as that is more
> efficient for the low number of cache lines touched during a search
> (2 for the inner nodes, 4 for the leaf nodes) than a binary search.
> We store termination markers (zero length for the leaf nodes, an
> otherwise impossible high bit for the inner nodes) to terminate the key
> list / records instead of storing a count to use the available cache
> lines as efficiently as possible.
> 
> One quirk of the algorithm is that while we normally split a node half and
> half like usual btree implementations we just spill over entries added at
> the very end of the list to a new node on its own.  This means we get a
> 100% fill grade for the common cases of bulk inseration at reading an
> inode into memory, and when only sequentially appending to a file.  The
> downside is a slightly higher chance of splits on the first random
> inserations.
> 
> Both insert and removal manually recurse into the lower levels, but
> the bulk deletion of the whole tree is still implemented as a recursive
> function call, although one limited by the overall depth and with very
> little stack usage in every iteration.
> 
> For the first few extents we dynamically grow the list from a single
> extent to the next powers of two until we have a first full leaf block
> and that building the actual tree.
> 
> The code started out based on the generic lib/btree.c code from Joern
> Engel based on earlier work from Peter Zijlstra, but has since been
> rewritten beyond recognition.

Ok, time for a real review.

> Signed-off-by: Christoph Hellwig <hch@xxxxxx>
> ---
>  fs/xfs/Makefile                |    1 +
>  fs/xfs/libxfs/xfs_bmap.c       |   19 +-
>  fs/xfs/libxfs/xfs_bmap_btree.c |  103 +---
>  fs/xfs/libxfs/xfs_bmap_btree.h |    7 +-
>  fs/xfs/libxfs/xfs_format.h     |    4 -
>  fs/xfs/libxfs/xfs_iext_tree.c  | 1043 ++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_inode_fork.c | 1035 +--------------------------------------
>  fs/xfs/libxfs/xfs_inode_fork.h |   84 +---
>  fs/xfs/libxfs/xfs_types.h      |    3 +-
>  fs/xfs/scrub/bmap.c            |    5 +-
>  fs/xfs/xfs_inode.c             |    2 +-
>  fs/xfs/xfs_inode_item.c        |    2 -
>  fs/xfs/xfs_trace.h             |   51 +-
>  13 files changed, 1103 insertions(+), 1256 deletions(-)
>  create mode 100644 fs/xfs/libxfs/xfs_iext_tree.c
> 
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index a2a5d046793d..7ceb41a9786a 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -49,6 +49,7 @@ xfs-y				+= $(addprefix libxfs/, \
>  				   xfs_dquot_buf.o \
>  				   xfs_ialloc.o \
>  				   xfs_ialloc_btree.o \
> +				   xfs_iext_tree.o \
>  				   xfs_inode_fork.o \
>  				   xfs_inode_buf.o \
>  				   xfs_log_rlimit.o \
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index e020bd3f8717..46bda00dca79 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -805,6 +805,8 @@ xfs_bmap_local_to_extents_empty(
>  	xfs_bmap_forkoff_reset(ip, whichfork);
>  	ifp->if_flags &= ~XFS_IFINLINE;
>  	ifp->if_flags |= XFS_IFEXTENTS;
> +	ifp->if_u1.if_root = NULL;
> +	ifp->if_height = 0;
>  	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
>  }
>  
> @@ -846,8 +848,7 @@ xfs_bmap_local_to_extents(
>  
>  	flags = 0;
>  	error = 0;
> -	ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
> -								XFS_IFINLINE);
> +	ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS)) == XFS_IFINLINE);
>  	memset(&args, 0, sizeof(args));
>  	args.tp = tp;
>  	args.mp = ip->i_mount;
> @@ -891,6 +892,9 @@ xfs_bmap_local_to_extents(
>  	xfs_bmap_local_to_extents_empty(ip, whichfork);
>  	flags |= XFS_ILOG_CORE;
>  
> +	ifp->if_u1.if_root = NULL;
> +	ifp->if_height = 0;
> +
>  	rec.br_startoff = 0;
>  	rec.br_startblock = args.fsbno;
>  	rec.br_blockcount = 1;
> @@ -1177,6 +1181,7 @@ xfs_iread_extents(
>  	xfs_extnum_t		nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
>  	struct xfs_btree_block	*block = ifp->if_broot;
>  	struct xfs_iext_cursor	ext;
> +	struct xfs_bmbt_irec	new;
>  	xfs_fsblock_t		bno;
>  	struct xfs_buf		*bp;
>  	xfs_extnum_t		i, j;
> @@ -1193,7 +1198,8 @@ xfs_iread_extents(
>  
>  	ifp->if_bytes = 0;
>  	ifp->if_real_bytes = 0;
> -	xfs_iext_add(ifp, 0, nextents);
> +	ifp->if_u1.if_root = NULL;
> +	ifp->if_height = 0;
>  
>  	/*
>  	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
> @@ -1258,16 +1264,15 @@ xfs_iread_extents(
>  		 * Copy records into the extent records.
>  		 */
>  		frp = XFS_BMBT_REC_ADDR(mp, block, 1);
> -		for (j = 0; j < num_recs; j++, i++, frp++) {
> -			xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
> +		for (j = 0; j < num_recs; j++, frp++, i++) {
>  			if (!xfs_bmbt_validate_extent(mp, whichfork, frp)) {
>  				XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
>  						 XFS_ERRLEVEL_LOW, mp);
>  				error = -EFSCORRUPTED;
>  				goto out_brelse;
>  			}
> -			trp->l0 = be64_to_cpu(frp->l0);
> -			trp->l1 = be64_to_cpu(frp->l1);
> +			xfs_bmbt_disk_get_all(frp, &new);
> +			xfs_iext_insert(ip, &ext, 1, &new, state);
>  			trace_xfs_read_extent(ip, &ext, state, _THIS_IP_);
>  			xfs_iext_next(ifp, &ext);
>  		}
> diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
> index 89260972a0f6..c10aecaaae44 100644
> --- a/fs/xfs/libxfs/xfs_bmap_btree.c
> +++ b/fs/xfs/libxfs/xfs_bmap_btree.c
> @@ -71,73 +71,21 @@ xfs_bmdr_to_bmbt(
>  	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
>  }
>  
> -/*
> - * Convert a compressed bmap extent record to an uncompressed form.
> - * This code must be in sync with the routines xfs_bmbt_get_startoff,
> - * xfs_bmbt_get_startblock and xfs_bmbt_get_blockcount.
> - */
> -STATIC void
> -__xfs_bmbt_get_all(
> -		uint64_t l0,
> -		uint64_t l1,
> -		xfs_bmbt_irec_t *s)
> -{
> -	int	ext_flag;
> -	xfs_exntst_t st;
> -
> -	ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
> -	s->br_startoff = ((xfs_fileoff_t)l0 &
> -			   xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
> -	s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
> -			   (((xfs_fsblock_t)l1) >> 21);
> -	s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
> -	/* This is xfs_extent_state() in-line */
> -	if (ext_flag) {
> -		ASSERT(s->br_blockcount != 0);	/* saved for DMIG */
> -		st = XFS_EXT_UNWRITTEN;
> -	} else
> -		st = XFS_EXT_NORM;
> -	s->br_state = st;
> -}
> -
>  void
> -xfs_bmbt_get_all(
> -	xfs_bmbt_rec_host_t *r,
> -	xfs_bmbt_irec_t *s)
> -{
> -	__xfs_bmbt_get_all(r->l0, r->l1, s);
> -}
> -
> -/*
> - * Extract the blockcount field from an in memory bmap extent record.
> - */
> -xfs_filblks_t
> -xfs_bmbt_get_blockcount(
> -	xfs_bmbt_rec_host_t	*r)
> -{
> -	return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
> -}
> -
> -/*
> - * Extract the startblock field from an in memory bmap extent record.
> - */
> -xfs_fsblock_t
> -xfs_bmbt_get_startblock(
> -	xfs_bmbt_rec_host_t	*r)
> -{
> -	return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
> -	       (((xfs_fsblock_t)r->l1) >> 21);
> -}
> -
> -/*
> - * Extract the startoff field from an in memory bmap extent record.
> - */
> -xfs_fileoff_t
> -xfs_bmbt_get_startoff(
> -	xfs_bmbt_rec_host_t	*r)
> -{
> -	return ((xfs_fileoff_t)r->l0 &
> -		 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
> +xfs_bmbt_disk_get_all(
> +	struct xfs_bmbt_rec	*rec,
> +	struct xfs_bmbt_irec	*irec)
> +{
> +	uint64_t		l0 = get_unaligned_be64(&rec->l0);
> +	uint64_t		l1 = get_unaligned_be64(&rec->l1);
> +
> +	irec->br_startoff = (l0 & xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
> +	irec->br_startblock = ((l0 & xfs_mask64lo(9)) << 43) | (l1 >> 21);
> +	irec->br_blockcount = l1 & xfs_mask64lo(21);
> +	if (l0 >> (64 - BMBT_EXNTFLAG_BITLEN))
> +		irec->br_state = XFS_EXT_UNWRITTEN;
> +	else
> +		irec->br_state = XFS_EXT_NORM;
>  }
>  
>  /*
> @@ -161,29 +109,6 @@ xfs_bmbt_disk_get_startoff(
>  		 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
>  }
>  
> -/*
> - * Set all the fields in a bmap extent record from the uncompressed form.
> - */
> -void
> -xfs_bmbt_set_all(
> -	struct xfs_bmbt_rec_host *r,
> -	struct xfs_bmbt_irec	*s)
> -{
> -	int			extent_flag = (s->br_state != XFS_EXT_NORM);
> -
> -	ASSERT(s->br_state == XFS_EXT_NORM || s->br_state == XFS_EXT_UNWRITTEN);
> -	ASSERT(!(s->br_startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)));
> -	ASSERT(!(s->br_blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)));
> -	ASSERT(!(s->br_startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)));
> -
> -	r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
> -		 ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
> -		 ((xfs_bmbt_rec_base_t)s->br_startblock >> 43);
> -	r->l1 = ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
> -		 ((xfs_bmbt_rec_base_t)s->br_blockcount &
> -		  (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
> -}
> -
>  /*
>   * Set all the fields in a bmap extent record from the uncompressed form.
>   */
> diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
> index 2fbfe2a24b15..714bfbaf9b2d 100644
> --- a/fs/xfs/libxfs/xfs_bmap_btree.h
> +++ b/fs/xfs/libxfs/xfs_bmap_btree.h
> @@ -98,16 +98,11 @@ struct xfs_trans;
>   */
>  extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
>  			struct xfs_btree_block *, int);
> -extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
> -extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
> -extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
> -extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
>  
>  void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s);
>  extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
>  extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
> -
> -extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
> +extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
>  
>  extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
>  			xfs_bmdr_block_t *, int);
> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> index 6470dfa768ee..28d5391d2272 100644
> --- a/fs/xfs/libxfs/xfs_format.h
> +++ b/fs/xfs/libxfs/xfs_format.h
> @@ -1553,10 +1553,6 @@ typedef struct xfs_bmbt_rec {
>  typedef uint64_t	xfs_bmbt_rec_base_t;	/* use this for casts */
>  typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
>  
> -typedef struct xfs_bmbt_rec_host {
> -	uint64_t		l0, l1;
> -} xfs_bmbt_rec_host_t;
> -
>  /*
>   * Values and macros for delayed-allocation startblock fields.
>   */
> diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
> new file mode 100644
> index 000000000000..acb66c0677e7
> --- /dev/null
> +++ b/fs/xfs/libxfs/xfs_iext_tree.c
> @@ -0,0 +1,1043 @@
> +/*
> + * Copyright (c) 2017 Christoph Hellwig.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + */
> +
> +#include <linux/cache.h>
> +#include <linux/kernel.h>
> +#include <linux/slab.h>
> +#include "xfs.h"
> +#include "xfs_format.h"
> +#include "xfs_bit.h"
> +#include "xfs_log_format.h"
> +#include "xfs_inode.h"
> +#include "xfs_inode_fork.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_trace.h"
> +
> +/*
> + * In-core extent record layout:
> + *
> + * +-------+----------------------------+
> + * | 00:51 | all 52 bits of startoff    |
> + * | 52:63 | low 12 bits of startblock  |
> + * +-------+----------------------------+
> + * | 00:20 | all 21 bits of length      |
> + * |    21 | unwritten extent bit       |
> + * | 22:63 | high 42 bits of startblock |
> + * +-------+----------------------------+
> + */
> +#define XFS_IEXT_STARTOFF_MASK		xfs_mask64lo(52)
> +#define XFS_IEXT_LENGTH_MASK		xfs_mask64lo(21)
> +#define XFS_IEXT_STARTBLOCK_MASK	xfs_mask64lo(54)
> +
> +struct xfs_iext_rec {
> +	uint64_t			lo;
> +	uint64_t			hi;
> +};
> +
> +/*
> + * Given that the length can't be a zero, only an empty hi value indicates an
> + * unused record.
> + */
> +static bool xfs_iext_rec_is_empty(struct xfs_iext_rec *rec)
> +{
> +	return rec->hi == 0;
> +}
> +
> +static inline void xfs_iext_rec_clear(struct xfs_iext_rec *rec)
> +{
> +	rec->lo = 0;
> +	rec->hi = 0;
> +}
> +
> +static void
> +xfs_iext_set(
> +	struct xfs_iext_rec	*rec,
> +	struct xfs_bmbt_irec	*irec)
> +{
> +	ASSERT((irec->br_startoff & ~XFS_IEXT_STARTOFF_MASK) == 0);
> +	ASSERT((irec->br_blockcount & ~XFS_IEXT_LENGTH_MASK) == 0);
> +	ASSERT((irec->br_startblock & ~XFS_IEXT_STARTBLOCK_MASK) == 0);
> +
> +	rec->lo = irec->br_startoff & XFS_IEXT_STARTOFF_MASK;
> +	rec->hi = irec->br_blockcount & XFS_IEXT_LENGTH_MASK;
> +
> +	rec->lo |= (irec->br_startblock << 52);
> +	rec->hi |= ((irec->br_startblock & ~xfs_mask64lo(12)) << (22 - 12));
> +
> +	if (irec->br_state == XFS_EXT_UNWRITTEN)
> +		rec->hi |= (1 << 21);
> +}
> +
> +static void
> +xfs_iext_get(
> +	struct xfs_bmbt_irec	*irec,
> +	struct xfs_iext_rec	*rec)
> +{
> +	irec->br_startoff = rec->lo & XFS_IEXT_STARTOFF_MASK;
> +	irec->br_blockcount = rec->hi & XFS_IEXT_LENGTH_MASK;
> +
> +	irec->br_startblock = rec->lo >> 52;
> +	irec->br_startblock |= (rec->hi & xfs_mask64hi(42)) >> (22 - 12);
> +
> +	if (rec->hi & (1 << 21))
> +		irec->br_state = XFS_EXT_UNWRITTEN;
> +	else
> +		irec->br_state = XFS_EXT_NORM;
> +}
> +
> +enum {
> +	NODE_SIZE	= L1_CACHE_BYTES * 4,
> +	KEYS_PER_NODE	= NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)),
> +	RECS_PER_LEAF	= (NODE_SIZE - sizeof(uint64_t) - sizeof(uint64_t)) /
> +				sizeof(struct xfs_iext_rec),
> +};
> +
> +/*
> + * In-core extent btree block layout:
> + *
> + * There are two types of blocks in the btree: leaf and inner (non-leaf) blocks.
> + *
> + * The leaf blocks are made up by %KEYS_PER_NODE extent records, which each
> + * contain the startoffset, blockcount, startblock and unwritten extent flag.
> + * See above for the exact format, followed by pointers to the previous and next
> + * leaf blocks (if there are any).
> + *
> + * The inner (non-leaf) blocks first contain KEYS_PER_NODE lookup keys, followed
> + * by an equal number of pointers to the btree blocks at the next lower level.
> + *
> + * Note that we currently always allocate 64-bits worth for pointers in the
> + * inner nodes or the link pointers in the leaf nodes even on 32-bit

We do?  sizeof(void *) == 4 on 32-bit; on a i686 machine this evaluates to....

NODE_SIZE = 64 * 4,
KEYS_PER_NODE = 256 / (8 + 4) = 21

While on x64 this becomes:

KEYS_PER_NODE = 256 / (8 + 8) = 16

...right?

RECS_PER_LEAF = (256 - 8 - 8) / 16 = 15

Does RECS_PER_LEAF reserve two uint64_t for the prev and next pointers
in struct xfs_iext_leaf?  It'd be a little more clear if the definition
was:

RECS_PER_LEAF = (NODE_SIZE - (2 * sizeof(void *))) / sizeof(xfs_iext_rec)

Frankly I wonder why not just have:

(NODE_SIZE - sizeof(xfs_iext_leaf_tail)) / sizeof(iext_rec)

Though I suppose practically speaking none of this changes RECS_PER_LEAF.

> + * architectures, so that we can use consistent addressing using and array of
> + * 64-bit unsigned integers.  If someone still cares for 32-bit architectures
> + * this could be optimized a bit better for them.
> + *
> + *		+-------+-------+-------+-------+-------+----------+----------+
> + * Leaf:	| rec 1 | rec 2 | rec 3 | rec 4 | rec N | prev-ptr | next-ptr |
> + *		+-------+-------+-------+-------+-------+----------+----------+
> + *
> + *		+-------+-------+-------+-------+-------+-------+------+-------+
> + * Inner:	| key 1 | key 2 | key 3 | key N | ptr 1 | ptr 2 | ptr3 | ptr N |
> + *		+-------+-------+-------+-------+-------+-------+------+-------+
> + */
> +struct xfs_iext_node {
> +	uint64_t		keys[KEYS_PER_NODE];
> +#define XFS_IEXT_KEY_INVALID	(1ULL << 63)
> +	void			*ptrs[KEYS_PER_NODE];
> +};

Hm, ok, let's do the math here.  The data forks can have at most 2^32
extents, and the attr fork can have at most 2^16 extents.

The L1 cache sizes range from 4 bytes (h8300) to 256 bytes (s390), with
x64 at 64 bytes and arm64/ppc64 at 128.

This means that this is broken on h8300 because it's not possible to
create a leaf containing even one record.  Not that even a 32-byte leaf
is acceptable in terms of overhead.  Perhaps NODE_SIZE should have a
minimum?

For the data fork on x64, the largest our tree can get is:

2^32 max extent records
RECS_PER_LEAF = 15
KEYS_PER_NODE = 16

level 0 = 286,331,154 leaf blocks
level 1 = 17,895,698 nodes
level 2 = 1,118,482 nodes
level 3 = 69,906 nodes
level 4 = 4,370 nodes
level 5 = 274 nodes
level 6 = 18 nodes
level 7 = 2 nodes
level 8 = 1 node
== 305,419,904 blocks, or 78GB of RAM?  The leaves eat 68G, the nodes eat 10G.

Now let's try s390:

RECS_PER_LEAF = 63
KEYS_PER_NODE = 64

level 0 = 68,174,085 leaf blocks
level 1 = 1,065,221 nodes
level 2 = 16,645 nodes
level 3 = 261 nodes
level 4 = 5 nodes
level 5 = 1 node
== 69,256,218 blocks, or 71GB, or 3G tree overhead.

Hm, I suppose ~80G of RAM to map ~16T of space isn't bad, assuming
each extent maps a single 4k block.

I guess it's pretty horrible if each block is instead 512b.

However, prior to this code we'd just fall over dead. :)

> +
> +struct xfs_iext_leaf {
> +	struct xfs_iext_rec	recs[RECS_PER_LEAF];
> +	struct xfs_iext_leaf	*prev;
> +	struct xfs_iext_leaf	*next;
> +};
> +
> +inline xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp)

static inline?

> +{
> +	return ifp->if_bytes / sizeof(struct xfs_iext_rec);
> +}
> +
> +static inline int xfs_iext_max_recs(struct xfs_ifork *ifp)
> +{
> +	if (ifp->if_height == 1)
> +		return xfs_iext_count(ifp);
> +	return RECS_PER_LEAF;
> +}
> +
> +static inline struct xfs_iext_rec *cur_rec(struct xfs_iext_cursor *cur)
> +{
> +	return &cur->leaf->recs[cur->pos];
> +}
> +
> +static noinline bool xfs_iext_valid(struct xfs_ifork *ifp,
> +		struct xfs_iext_cursor *cur)
> +{
> +	if (cur->pos < 0)
> +		return false;
> +	if (cur->pos >= xfs_iext_max_recs(ifp))
> +		return false;
> +	if (!cur->leaf)
> +		return false;
> +	if (xfs_iext_rec_is_empty(cur_rec(cur)))
> +		return false;
> +	return true;
> +}
> +
> +static void *
> +xfs_iext_find_first_leaf(
> +	struct xfs_ifork	*ifp)
> +{
> +	struct xfs_iext_node	*node = ifp->if_u1.if_root;
> +	int			height;
> +
> +	if (!ifp->if_height)
> +		return NULL;
> +
> +	for (height = ifp->if_height; height > 1; height--) {
> +		node = node->ptrs[0];
> +		ASSERT(node);
> +	}
> +
> +	return node;
> +}
> +
> +static void *
> +xfs_iext_find_last_leaf(
> +	struct xfs_ifork	*ifp)
> +{
> +	struct xfs_iext_node	*node = ifp->if_u1.if_root;
> +	int			height, i;
> +
> +	if (!ifp->if_height)
> +		return NULL;
> +
> +	for (height = ifp->if_height; height > 1; height--) {
> +		for (i = 1; i < KEYS_PER_NODE; i++)
> +			if (!node->ptrs[i])
> +				break;
> +		node = node->ptrs[i - 1];
> +		ASSERT(node);
> +	}
> +
> +	return node;
> +}
> +
> +void
> +xfs_iext_first(
> +	struct xfs_ifork	*ifp,
> +	struct xfs_iext_cursor	*cur)
> +{
> +	cur->pos = 0;
> +	cur->leaf = xfs_iext_find_first_leaf(ifp);
> +}
> +
> +void
> +xfs_iext_last(
> +	struct xfs_ifork	*ifp,
> +	struct xfs_iext_cursor	*cur)
> +{
> +	int			i;
> +
> +	cur->leaf = xfs_iext_find_last_leaf(ifp);
> +	if (!cur->leaf) {
> +		cur->pos = 0;
> +		return;
> +	}
> +
> +	for (i = 1; i < xfs_iext_max_recs(ifp); i++) {
> +		if (xfs_iext_rec_is_empty(&cur->leaf->recs[i]))
> +			break;
> +	}
> +	cur->pos = i - 1;
> +}
> +
> +void
> +xfs_iext_next(
> +	struct xfs_ifork	*ifp,
> +	struct xfs_iext_cursor	*cur)
> +{
> +	if (!cur->leaf) {
> +		ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
> +		xfs_iext_first(ifp, cur);
> +		return;
> +	}
> +
> +	ASSERT(cur->pos >= 0);
> +	ASSERT(cur->pos < xfs_iext_max_recs(ifp));
> +
> +	cur->pos++;
> +	if (!xfs_iext_valid(ifp, cur) && ifp->if_height > 1 &&
> +	    cur->leaf && cur->leaf->next) {
> +		cur->leaf = cur->leaf->next;
> +		cur->pos = 0;
> +	}
> +}
> +
> +void
> +xfs_iext_prev(
> +	struct xfs_ifork	*ifp,
> +	struct xfs_iext_cursor	*cur)
> +{
> +	if (!cur->leaf) {
> +		ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
> +		xfs_iext_last(ifp, cur);
> +		return;
> +	}
> +
> +	ASSERT(cur->pos >= 0);
> +	ASSERT(cur->pos <= RECS_PER_LEAF);
> +
> +recurse:
> +	do {
> +		cur->pos--;
> +		if (xfs_iext_valid(ifp, cur))
> +			return;
> +	} while (cur->pos > 0);
> +
> +	if (ifp->if_height > 1 && cur->leaf->prev) {
> +		cur->leaf = cur->leaf->prev;
> +		cur->pos = RECS_PER_LEAF;
> +		goto recurse;
> +	}
> +}
> +
> +static inline int
> +xfs_iext_key_cmp(
> +	struct xfs_iext_node	*node,
> +	int			n,
> +	xfs_fileoff_t		offset)
> +{
> +	if (node->keys[n] > offset)
> +		return 1;
> +	if (node->keys[n] < offset)
> +		return -1;
> +	return 0;
> +}
> +
> +static inline int
> +xfs_iext_rec_cmp(
> +	struct xfs_iext_rec	*rec,
> +	xfs_fileoff_t		offset)
> +{
> +	uint64_t		rec_offset = rec->lo & XFS_IEXT_STARTOFF_MASK;
> +	u32			rec_len = rec->hi & XFS_IEXT_LENGTH_MASK;
> +
> +	if (rec_offset > offset)
> +		return 1;
> +	if (rec_offset + rec_len <= offset)
> +		return -1;
> +	return 0;
> +}
> +
> +static void *
> +xfs_iext_find_level(
> +	struct xfs_ifork	*ifp,
> +	xfs_fileoff_t		offset,
> +	int			level)
> +{
> +	struct xfs_iext_node	*node = ifp->if_u1.if_root;
> +	int			height, i;
> +
> +	if (!ifp->if_height)
> +		return NULL;
> +
> +	for (height = ifp->if_height; height > level; height--) {
> +		for (i = 1; i < KEYS_PER_NODE; i++)
> +			if (xfs_iext_key_cmp(node, i, offset) > 0)
> +				break;
> +
> +		node = node->ptrs[i - 1];
> +		if (!node)
> +			break;
> +	}
> +
> +	return node;
> +}
> +
> +static int
> +xfs_iext_node_pos(
> +	struct xfs_iext_node	*node,
> +	xfs_fileoff_t		offset)
> +{
> +	int			i;
> +
> +	for (i = 1; i < KEYS_PER_NODE; i++) {
> +		if (xfs_iext_key_cmp(node, i, offset) > 0)
> +			break;
> +	}
> +
> +	return i - 1;
> +}
> +
> +static int
> +xfs_iext_node_insert_pos(
> +	struct xfs_iext_node	*node,
> +	xfs_fileoff_t		offset)
> +{
> +	int			i;
> +
> +	for (i = 0; i < KEYS_PER_NODE; i++) {
> +		if (xfs_iext_key_cmp(node, i, offset) > 0)
> +			return i;
> +	}
> +
> +	return KEYS_PER_NODE;
> +}
> +
> +static int
> +xfs_iext_node_nr_entries(
> +	struct xfs_iext_node	*node,
> +	int			start)
> +{
> +	int			i;
> +
> +	for (i = start; i < KEYS_PER_NODE; i++) {
> +		if (node->keys[i] == XFS_IEXT_KEY_INVALID)
> +			break;
> +	}
> +
> +	return i;
> +}
> +
> +static int
> +xfs_iext_leaf_nr_entries(
> +	struct xfs_ifork	*ifp,
> +	struct xfs_iext_leaf	*leaf,
> +	int			start)
> +{
> +	int			i;
> +
> +	for (i = start; i < xfs_iext_max_recs(ifp); i++) {
> +		if (xfs_iext_rec_is_empty(&leaf->recs[i]))
> +			break;
> +	}
> +
> +	return i;
> +}
> +
> +static inline uint64_t
> +xfs_iext_leaf_key(
> +	struct xfs_iext_leaf	*leaf,
> +	int			n)
> +{
> +	return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK;
> +}
> +
> +static void
> +xfs_iext_grow(
> +	struct xfs_ifork	*ifp)
> +{
> +	struct xfs_iext_node	*node = kmem_zalloc(NODE_SIZE, KM_NOFS);
> +	int			i;
> +
> +	if (ifp->if_height == 1) {
> +		struct xfs_iext_leaf *prev = ifp->if_u1.if_root;
> +
> +		node->keys[0] = xfs_iext_leaf_key(prev, 0);
> +		node->ptrs[0] = prev;
> +	} else  {
> +		struct xfs_iext_node *prev = ifp->if_u1.if_root;
> +
> +		ASSERT(ifp->if_height > 1);
> +
> +		node->keys[0] = prev->keys[0];
> +		node->ptrs[0] = prev;
> +	}
> +
> +	for (i = 1; i < KEYS_PER_NODE; i++)
> +		node->keys[i] = XFS_IEXT_KEY_INVALID;
> +
> +	ifp->if_u1.if_root = node;
> +	ifp->if_height++;
> +}
> +
> +static void
> +xfs_iext_update_node(
> +	struct xfs_ifork	*ifp,
> +	xfs_fileoff_t		old_offset,
> +	xfs_fileoff_t		new_offset,
> +	int			level,
> +	void			*ptr)
> +{
> +	struct xfs_iext_node	*node = ifp->if_u1.if_root;
> +	int			height, i;
> +
> +	for (height = ifp->if_height; height > level; height--) {
> +		for (i = 0; i < KEYS_PER_NODE; i++) {
> +			if (i > 0 && xfs_iext_key_cmp(node, i, old_offset) > 0)
> +				break;
> +			if (node->keys[i] == old_offset)
> +				node->keys[i] = new_offset;
> +		}
> +		node = node->ptrs[i - 1];
> +		ASSERT(node);
> +	}
> +
> +	ASSERT(node == ptr);
> +}
> +
> +static struct xfs_iext_node *
> +xfs_iext_split_node(
> +	struct xfs_iext_node	**nodep,
> +	int			*pos,
> +	int			*nr_entries)
> +{
> +	struct xfs_iext_node	*node = *nodep;
> +	struct xfs_iext_node	*new = kmem_zalloc(NODE_SIZE, KM_NOFS);
> +	const int		nr_move = KEYS_PER_NODE / 2;
> +	int			nr_keep = nr_move + (KEYS_PER_NODE & 1);
> +	int			i = 0;
> +
> +	/* for sequential append operations just spill over into the new node */
> +	if (*pos == KEYS_PER_NODE) {
> +		*nodep = new;
> +		*pos = 0;
> +		*nr_entries = 0;
> +		goto done;
> +	}
> +
> +
> +	for (i = 0; i < nr_move; i++) {
> +		new->keys[i] = node->keys[nr_keep + i];
> +		new->ptrs[i] = node->ptrs[nr_keep + i];
> +
> +		node->keys[nr_keep + i] = XFS_IEXT_KEY_INVALID;
> +		node->ptrs[nr_keep + i] = NULL;
> +	}
> +
> +	if (*pos >= nr_keep) {
> +		*nodep = new;
> +		*pos -= nr_keep;
> +		*nr_entries = nr_move;
> +	} else {
> +		*nr_entries = nr_keep;
> +	}
> +done:
> +	for (; i < KEYS_PER_NODE; i++)
> +		new->keys[i] = XFS_IEXT_KEY_INVALID;
> +	return new;
> +}
> +
> +static void
> +xfs_iext_insert_node(
> +	struct xfs_ifork	*ifp,
> +	uint64_t		offset,
> +	void			*ptr,
> +	int			level)
> +{
> +	struct xfs_iext_node	*node, *new;
> +	int			i, pos, nr_entries;
> +
> +again:
> +	if (ifp->if_height < level)
> +		xfs_iext_grow(ifp);
> +
> +	new = NULL;
> +	node = xfs_iext_find_level(ifp, offset, level);
> +	pos = xfs_iext_node_insert_pos(node, offset);
> +	nr_entries = xfs_iext_node_nr_entries(node, pos);
> +
> +	ASSERT(pos >= nr_entries || xfs_iext_key_cmp(node, pos, offset) != 0);
> +	ASSERT(nr_entries <= KEYS_PER_NODE);
> +
> +	if (nr_entries == KEYS_PER_NODE)
> +		new = xfs_iext_split_node(&node, &pos, &nr_entries);
> +
> +	if (node != new && pos == 0 && nr_entries > 0)
> +		xfs_iext_update_node(ifp, node->keys[0], offset, level, node);
> +
> +	for (i = nr_entries; i > pos; i--) {
> +		node->keys[i] = node->keys[i - 1];
> +		node->ptrs[i] = node->ptrs[i - 1];
> +	}

/me wonders if memmove is more appropriate here, but I'm guessing the
loop came out of lib/btree.c?

--D

> +	node->keys[pos] = offset;
> +	node->ptrs[pos] = ptr;
> +
> +	if (new) {
> +		offset = new->keys[0];
> +		ptr = new;
> +		level++;
> +		goto again;
> +	}
> +}
> +
> +static struct xfs_iext_leaf *
> +xfs_iext_split_leaf(
> +	struct xfs_iext_cursor	*cur,
> +	int			*nr_entries)
> +{
> +	struct xfs_iext_leaf	*leaf = cur->leaf;
> +	struct xfs_iext_leaf	*new = kmem_zalloc(NODE_SIZE, KM_NOFS);
> +	const int		nr_move = RECS_PER_LEAF / 2;
> +	int			nr_keep = nr_move + (RECS_PER_LEAF & 1);
> +	int			i;
> +
> +	/* for sequential append operations just spill over into the new node */
> +	if (cur->pos == KEYS_PER_NODE) {
> +		cur->leaf = new;
> +		cur->pos = 0;
> +		*nr_entries = 0;
> +		goto done;
> +	}
> +
> +	if (nr_keep & 1)
> +		nr_keep++;
> +
> +	for (i = 0; i < nr_move; i++) {
> +		new->recs[i] = leaf->recs[nr_keep + i];
> +		xfs_iext_rec_clear(&leaf->recs[nr_keep + i]);
> +	}
> +
> +	if (cur->pos >= nr_keep) {
> +		cur->leaf = new;
> +		cur->pos -= nr_keep;
> +		*nr_entries = nr_move;
> +	} else {
> +		*nr_entries = nr_keep;
> +	}
> +done:
> +	if (leaf->next)
> +		leaf->next->prev = new;
> +	new->next = leaf->next;
> +	new->prev = leaf;
> +	leaf->next = new;
> +	return new;
> +}
> +
> +static void
> +xfs_iext_alloc_root(
> +	struct xfs_ifork	*ifp,
> +	struct xfs_iext_cursor	*cur)
> +{
> +	ASSERT(ifp->if_bytes == 0);
> +
> +	ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
> +	ifp->if_height = 1;
> +
> +	/* now that we have a node step into it */
> +	cur->leaf = ifp->if_u1.if_root;
> +	cur->pos = 0;
> +}
> +
> +static void
> +xfs_iext_realloc_root(
> +	struct xfs_ifork	*ifp,
> +	struct xfs_iext_cursor	*cur)
> +{
> +	size_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec);
> +	void *new;
> +
> +	/* account for the prev/next pointers */
> +	if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
> +		new_size = NODE_SIZE;
> +
> +	new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS);
> +	memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
> +	ifp->if_u1.if_root = new;
> +	cur->leaf = new;
> +}
> +
> +static void
> +__xfs_iext_insert(
> +	struct xfs_ifork	*ifp,
> +	struct xfs_iext_cursor	*cur,
> +	struct xfs_bmbt_irec	*irec)
> +{
> +	xfs_fileoff_t		offset = irec->br_startoff;
> +	struct xfs_iext_leaf	*new = NULL;
> +	int			nr_entries, i;
> +
> +	if (ifp->if_height == 0)
> +		xfs_iext_alloc_root(ifp, cur);
> +	else if (ifp->if_height == 1)
> +		xfs_iext_realloc_root(ifp, cur);
> +
> +	nr_entries = xfs_iext_leaf_nr_entries(ifp, cur->leaf, cur->pos);
> +	ASSERT(nr_entries <= RECS_PER_LEAF);
> +	ASSERT(cur->pos >= nr_entries ||
> +	       xfs_iext_rec_cmp(cur_rec(cur), irec->br_startoff) != 0);
> +
> +	if (nr_entries == RECS_PER_LEAF)
> +		new = xfs_iext_split_leaf(cur, &nr_entries);
> +
> +	if (cur->leaf != new && cur->pos == 0 && nr_entries > 0) {
> +		xfs_iext_update_node(ifp, xfs_iext_leaf_key(cur->leaf, 0), offset, 1,
> +				cur->leaf);
> +	}
> +
> +	for (i = nr_entries; i > cur->pos; i--)
> +		cur->leaf->recs[i] = cur->leaf->recs[i - 1];
> +	xfs_iext_set(cur_rec(cur), irec);
> +	ifp->if_bytes += sizeof(struct xfs_iext_rec);
> +
> +	if (new)
> +		xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2);
> +}
> +
> +void
> +xfs_iext_insert(
> +	struct xfs_inode	*ip,
> +	struct xfs_iext_cursor	*cur,
> +	xfs_extnum_t		nr_extents,
> +	struct xfs_bmbt_irec	*new,
> +	int			state)
> +{
> +	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
> +	int			i;
> +
> +	ASSERT(nr_extents > 0);
> +
> +	for (i = nr_extents - 1; i >= 0; i--) {
> +		__xfs_iext_insert(ifp, cur, new + i);
> +		trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
> +	}
> +}
> +
> +static struct xfs_iext_node *
> +xfs_iext_rebalance_node(
> +	struct xfs_iext_node	*parent,
> +	int			*pos,
> +	struct xfs_iext_node	*node,
> +	int			nr_entries)
> +{
> +	if (nr_entries == 0)
> +		return node;
> +
> +	if (*pos > 0) {
> +		struct xfs_iext_node *prev = parent->ptrs[*pos - 1];
> +		int nr_prev = xfs_iext_node_nr_entries(prev, 0), i;
> +
> +		if (nr_prev + nr_entries <= KEYS_PER_NODE) {
> +			for (i = 0; i < nr_entries; i++) {
> +				prev->keys[nr_prev + i] = node->keys[i];
> +				prev->ptrs[nr_prev + i] = node->ptrs[i];
> +			}
> +			return node;
> +		}
> +	}
> +
> +	if (*pos + 1 < xfs_iext_node_nr_entries(parent, *pos)) {
> +		struct xfs_iext_node *next = parent->ptrs[*pos + 1];
> +		int nr_next = xfs_iext_node_nr_entries(next, 0), i;
> +
> +		if (nr_entries + nr_next <= KEYS_PER_NODE) {
> +			for (i = 0; i < nr_next; i++) {
> +				node->keys[nr_entries + i] = next->keys[i];
> +				node->ptrs[nr_entries + i] = next->ptrs[i];
> +			}
> +
> +			++*pos;
> +			return next;
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +static void
> +xfs_iext_remove_node(
> +	struct xfs_ifork	*ifp,
> +	xfs_fileoff_t		offset,
> +	void			*victim)
> +{
> +	struct xfs_iext_node	*node, *parent;
> +	int			level = 2, pos, nr_entries, i;
> +
> +	ASSERT(level <= ifp->if_height);
> +	node = xfs_iext_find_level(ifp, offset, level);
> +	pos = xfs_iext_node_pos(node, offset);
> +again:
> +	ASSERT(node->ptrs[pos]);
> +	ASSERT(node->ptrs[pos] == victim);
> +	kmem_free(victim);
> +
> +	nr_entries = xfs_iext_node_nr_entries(node, pos) - 1;
> +	offset = node->keys[0];
> +	for (i = pos; i < nr_entries; i++) {
> +		node->keys[i] = node->keys[i + 1];
> +		node->ptrs[i] = node->ptrs[i + 1];
> +	}
> +	node->keys[nr_entries] = XFS_IEXT_KEY_INVALID;
> +	node->ptrs[nr_entries] = NULL;
> +
> +	if (pos == 0 && nr_entries > 0) {
> +		xfs_iext_update_node(ifp, offset, node->keys[0], level,
> +				node);
> +		offset = node->keys[0];
> +	}
> +
> +	if (nr_entries >= KEYS_PER_NODE / 2)
> +		return;
> +
> +	if (level < ifp->if_height) {
> +		level++;
> +		parent = xfs_iext_find_level(ifp, offset, level);
> +		pos = xfs_iext_node_pos(parent, offset);
> +
> +		ASSERT(pos != KEYS_PER_NODE);
> +		ASSERT(parent->ptrs[pos] == node);
> +
> +		node = xfs_iext_rebalance_node(parent, &pos, node, nr_entries);
> +		if (node) {
> +			offset = node->keys[0];
> +			victim = node;
> +			node = parent;
> +			goto again;
> +		}
> +	} else if (nr_entries == 1) {
> +		ASSERT(node == ifp->if_u1.if_root);
> +		ifp->if_u1.if_root = node->ptrs[0];
> +		ifp->if_height--;
> +		kmem_free(node);
> +	}
> +}
> +
> +static void
> +xfs_iext_rebalance_leaf(
> +	struct xfs_ifork	*ifp,
> +	struct xfs_iext_cursor	*cur,
> +	struct xfs_iext_leaf	*leaf,
> +	xfs_fileoff_t		offset,
> +	int			fill)
> +{
> +	if (leaf->prev) {
> +		int nr_prev = xfs_iext_leaf_nr_entries(ifp, leaf->prev, 0), i;
> +
> +		if (nr_prev + fill <= RECS_PER_LEAF) {
> +			for (i = 0; i < fill; i++)
> +				leaf->prev->recs[nr_prev + i] = leaf->recs[i];
> +
> +			if (cur->leaf == leaf) {
> +				cur->leaf = leaf->prev;
> +				cur->pos += nr_prev;
> +			}
> +			goto remove_node;
> +		}
> +	}
> +
> +	if (leaf->next) {
> +		int nr_next = xfs_iext_leaf_nr_entries(ifp, leaf->next, 0), i;
> +
> +		if (fill + nr_next <= RECS_PER_LEAF) {
> +			for (i = 0; i < nr_next; i++)
> +				leaf->recs[fill + i] = leaf->next->recs[i];
> +
> +			if (cur->leaf == leaf->next) {
> +				cur->leaf = leaf;
> +				cur->pos += fill;
> +			}
> +
> +			offset = xfs_iext_leaf_key(leaf->next, 0);
> +			leaf = leaf->next;
> +			goto remove_node;
> +		}
> +	}
> +
> +	return;
> +remove_node:
> +	if (leaf->prev)
> +		leaf->prev->next = leaf->next;
> +	if (leaf->next)
> +		leaf->next->prev = leaf->prev;
> +	xfs_iext_remove_node(ifp, offset, leaf);
> +}
> +
> +static void
> +xfs_iext_free_last_leaf(
> +	struct xfs_ifork	*ifp)
> +{
> +	ifp->if_u1.if_root = NULL;
> +	ifp->if_height--;
> +	kmem_free(ifp->if_u1.if_root);
> +}
> +
> +static void
> +__xfs_iext_remove(
> +	struct xfs_ifork	*ifp,
> +	struct xfs_iext_cursor	*cur)
> +{
> +	struct xfs_iext_leaf	*leaf = cur->leaf;
> +	xfs_fileoff_t		offset = xfs_iext_leaf_key(leaf, 0);
> +	int			i, nr_entries;
> +
> +	ASSERT(ifp->if_height > 0);
> +	ASSERT(ifp->if_u1.if_root != NULL);
> +	ASSERT(xfs_iext_valid(ifp, cur));
> +
> +	nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
> +	for (i = cur->pos; i < nr_entries; i++)
> +		leaf->recs[i] = leaf->recs[i + 1];
> +	xfs_iext_rec_clear(&leaf->recs[nr_entries]);
> +	ifp->if_bytes -= sizeof(struct xfs_iext_rec);
> +
> +	if (cur->pos == 0 && nr_entries > 0) {
> +		xfs_iext_update_node(ifp, offset, xfs_iext_leaf_key(leaf, 0), 1,
> +				leaf);
> +		offset = xfs_iext_leaf_key(leaf, 0);
> +	} else if (cur->pos == nr_entries) {
> +		if (ifp->if_height > 1 && leaf->next)
> +			cur->leaf = leaf->next;
> +		else
> +			cur->leaf = NULL;
> +		cur->pos = 0;
> +	}
> +
> +	if (nr_entries >= RECS_PER_LEAF / 2)
> +		return;
> +
> +	if (ifp->if_height > 1)
> +		xfs_iext_rebalance_leaf(ifp, cur, leaf, offset, nr_entries);
> +	else if (nr_entries == 0)
> +		xfs_iext_free_last_leaf(ifp);
> +}
> +
> +void
> +xfs_iext_remove(
> +	struct xfs_inode	*ip,
> +	struct xfs_iext_cursor	*cur,
> +	int			nr_extents,
> +	int			state)
> +{
> +	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
> +	int			i;
> +
> +	ASSERT(nr_extents > 0);
> +
> +	for (i = 0; i < nr_extents; i++) {
> +		trace_xfs_iext_remove(ip, cur, state, _RET_IP_);
> +		__xfs_iext_remove(ifp, cur);
> +	}
> +}
> +
> +/*
> + * Lookup the extent covering bno.
> + *
> + * If there is an extent covering bno return the extent index, and store the
> + * expanded extent structure in *gotp, and the extent cursor in *cur.
> + * If there is no extent covering bno, but there is an extent after it (e.g.
> + * it lies in a hole) return that extent in *gotp and its cursor in *cur
> + * instead.
> + * If bno is beyond the last extent return false, and return an invalid
> + * cursor value.
> + */
> +bool
> +xfs_iext_lookup_extent(
> +	struct xfs_inode	*ip,
> +	struct xfs_ifork	*ifp,
> +	xfs_fileoff_t		offset,
> +	struct xfs_iext_cursor	*cur,
> +	struct xfs_bmbt_irec	*gotp)
> +{
> +	XFS_STATS_INC(ip->i_mount, xs_look_exlist);
> +
> +	cur->leaf = xfs_iext_find_level(ifp, offset, 1);
> +	if (!cur->leaf) {
> +		cur->pos = 0;
> +		return false;
> +	}
> +
> +	for (cur->pos = 0; cur->pos < xfs_iext_max_recs(ifp); cur->pos++) {
> +		struct xfs_iext_rec *rec = cur_rec(cur);
> +
> +		if (xfs_iext_rec_is_empty(rec))
> +			break;
> +		if (xfs_iext_rec_cmp(rec, offset) >= 0)
> +			goto found;
> +	}
> +
> +	/* Try looking in the next node for an entry > offset */
> +	if (ifp->if_height == 1 || !cur->leaf->next)
> +		return false;
> +	cur->leaf = cur->leaf->next;
> +	cur->pos = 0;
> +	if (!xfs_iext_valid(ifp, cur))
> +		return false;
> +found:
> +	xfs_iext_get(gotp, cur_rec(cur));
> +	return true;
> +}
> +
> +/*
> + * Returns the last extent before end, and if this extent doesn't cover
> + * end, update end to the end of the extent.
> + */
> +bool
> +xfs_iext_lookup_extent_before(
> +	struct xfs_inode	*ip,
> +	struct xfs_ifork	*ifp,
> +	xfs_fileoff_t		*end,
> +	struct xfs_iext_cursor	*cur,
> +	struct xfs_bmbt_irec	*gotp)
> +{
> +	/* could be optimized to not even look up the next on a match.. */
> +	if (xfs_iext_lookup_extent(ip, ifp, *end - 1, cur, gotp) &&
> +	    gotp->br_startoff <= *end - 1)
> +		return true;
> +	if (!xfs_iext_prev_extent(ifp, cur, gotp))
> +		return false;
> +	*end = gotp->br_startoff + gotp->br_blockcount;
> +	return true;
> +}
> +
> +void
> +xfs_iext_update_extent(
> +	struct xfs_inode	*ip,
> +	int			state,
> +	struct xfs_iext_cursor	*cur,
> +	struct xfs_bmbt_irec	*new)
> +{
> +	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
> +
> +	if (cur->pos == 0) {
> +		struct xfs_bmbt_irec	old;
> +
> +		xfs_iext_get(&old, cur_rec(cur));
> +		if (new->br_startoff != old.br_startoff) {
> +			xfs_iext_update_node(ifp, old.br_startoff,
> +					new->br_startoff, 1, cur->leaf);
> +		}
> +	}
> +
> +	trace_xfs_bmap_pre_update(ip, cur, state, _RET_IP_);
> +	xfs_iext_set(cur_rec(cur), new);
> +	trace_xfs_bmap_post_update(ip, cur, state, _RET_IP_);
> +}
> +
> +/*
> + * Return true if there is an extent at cursor cur and return the expanded
> + * extent structure at cur in gotp in that case.  Else return false.
> + */
> +bool
> +xfs_iext_get_extent(
> +	struct xfs_ifork	*ifp,
> +	struct xfs_iext_cursor	*cur,
> +	struct xfs_bmbt_irec	*gotp)
> +{
> +	if (!xfs_iext_valid(ifp, cur))
> +		return false;
> +	xfs_iext_get(gotp, cur_rec(cur));
> +	return true;
> +}
> +
> +/*
> + * This is a recursive function, because of that we need to be extremely
> + * careful with stack usage.
> + */
> +static void
> +xfs_iext_destroy_node(
> +	struct xfs_iext_node	*node,
> +	int			level)
> +{
> +	int			i;
> +
> +	if (level > 1) {
> +		for (i = 0; i < KEYS_PER_NODE; i++) {
> +			if (node->keys[i] == XFS_IEXT_KEY_INVALID)
> +				break;
> +			xfs_iext_destroy_node(node->ptrs[i], level - 1);
> +		}
> +	}
> +
> +	kmem_free(node);
> +}
> +
> +void
> +xfs_iext_destroy(
> +	struct xfs_ifork	*ifp)
> +{
> +	xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height);
> +
> +	ifp->if_bytes = 0;
> +	ifp->if_height = 0;
> +	ifp->if_u1.if_root = NULL;
> +}
> diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
> index 5ac341d2b093..2711ff6ab2b3 100644
> --- a/fs/xfs/libxfs/xfs_inode_fork.c
> +++ b/fs/xfs/libxfs/xfs_inode_fork.c
> @@ -331,6 +331,7 @@ xfs_iformat_extents(
>  	int			size = nex * sizeof(xfs_bmbt_rec_t);
>  	struct xfs_iext_cursor	ext;
>  	struct xfs_bmbt_rec	*dp;
> +	struct xfs_bmbt_irec	new;
>  	int			i;
>  
>  	/*
> @@ -346,27 +347,22 @@ xfs_iformat_extents(
>  	}
>  
>  	ifp->if_real_bytes = 0;
> -	if (nex == 0)
> -		ifp->if_u1.if_extents = NULL;
> -	else
> -		xfs_iext_add(ifp, 0, nex);
> -
> -	ifp->if_bytes = size;
> +	ifp->if_bytes = 0;
> +	ifp->if_u1.if_root = NULL;
> +	ifp->if_height = 0;
>  	if (size) {
>  		dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
>  
>  		xfs_iext_first(ifp, &ext);
>  		for (i = 0; i < nex; i++, dp++) {
> -			xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
> -
>  			if (!xfs_bmbt_validate_extent(mp, whichfork, dp)) {
>  				XFS_ERROR_REPORT("xfs_iformat_extents(2)",
>  						 XFS_ERRLEVEL_LOW, mp);
>  				return -EFSCORRUPTED;
>  			}
>  
> -			ep->l0 = get_unaligned_be64(&dp->l0);
> -			ep->l1 = get_unaligned_be64(&dp->l1);
> +			xfs_bmbt_disk_get_all(dp, &new);
> +			xfs_iext_insert(ip, &ext, 1, &new, state);
>  			trace_xfs_read_extent(ip, &ext, state, _THIS_IP_);
>  			xfs_iext_next(ifp, &ext);
>  		}
> @@ -435,6 +431,10 @@ xfs_iformat_btree(
>  	ifp->if_flags &= ~XFS_IFEXTENTS;
>  	ifp->if_flags |= XFS_IFBROOT;
>  
> +	ifp->if_real_bytes = 0;
> +	ifp->if_bytes = 0;
> +	ifp->if_u1.if_root = NULL;
> +	ifp->if_height = 0;
>  	return 0;
>  }
>  
> @@ -662,14 +662,12 @@ xfs_idestroy_fork(
>  			ifp->if_u1.if_data = NULL;
>  			ifp->if_real_bytes = 0;
>  		}
> -	} else if ((ifp->if_flags & XFS_IFEXTENTS) &&
> -		   ((ifp->if_flags & XFS_IFEXTIREC) ||
> -		    (ifp->if_u1.if_extents != NULL))) {
> -		ASSERT(ifp->if_real_bytes != 0);
> +	} else if ((ifp->if_flags & XFS_IFEXTENTS) && ifp->if_height) {
>  		xfs_iext_destroy(ifp);
>  	}
> -	ASSERT(ifp->if_u1.if_extents == NULL);
> +
>  	ASSERT(ifp->if_real_bytes == 0);
> +
>  	if (whichfork == XFS_ATTR_FORK) {
>  		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
>  		ip->i_afp = NULL;
> @@ -679,13 +677,6 @@ xfs_idestroy_fork(
>  	}
>  }
>  
> -/* Count number of incore extents based on if_bytes */
> -xfs_extnum_t
> -xfs_iext_count(struct xfs_ifork *ifp)
> -{
> -	return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
> -}
> -
>  /*
>   * Convert in-core extents to on-disk form
>   *
> @@ -780,7 +771,6 @@ xfs_iflush_fork(
>  		       !(iip->ili_fields & extflag[whichfork]));
>  		if ((iip->ili_fields & extflag[whichfork]) &&
>  		    (ifp->if_bytes > 0)) {
> -			ASSERT(xfs_iext_get_ext(ifp, 0));
>  			ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
>  			(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
>  				whichfork);
> @@ -812,33 +802,6 @@ xfs_iflush_fork(
>  	}
>  }
>  
> -/*
> - * Return a pointer to the extent record at file index idx.
> - */
> -xfs_bmbt_rec_host_t *
> -xfs_iext_get_ext(
> -	xfs_ifork_t	*ifp,		/* inode fork pointer */
> -	xfs_extnum_t	idx)		/* index of target extent */
> -{
> -	ASSERT(idx >= 0);
> -	ASSERT(idx < xfs_iext_count(ifp));
> -
> -	if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
> -		return ifp->if_u1.if_ext_irec->er_extbuf;
> -	} else if (ifp->if_flags & XFS_IFEXTIREC) {
> -		xfs_ext_irec_t	*erp;		/* irec pointer */
> -		int		erp_idx = 0;	/* irec index */
> -		xfs_extnum_t	page_idx = idx;	/* ext index in target list */
> -
> -		erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
> -		return &erp->er_extbuf[page_idx];
> -	} else if (ifp->if_bytes) {
> -		return &ifp->if_u1.if_extents[idx];
> -	} else {
> -		return NULL;
> -	}
> -}
> -
>  /* Convert bmap state flags to an inode fork. */
>  struct xfs_ifork *
>  xfs_iext_state_to_fork(
> @@ -852,894 +815,6 @@ xfs_iext_state_to_fork(
>  	return &ip->i_df;
>  }
>  
> -/*
> - * Insert new item(s) into the extent records for incore inode
> - * fork 'ifp'.  'count' new items are inserted at index 'idx'.
> - */
> -void
> -xfs_iext_insert(
> -	xfs_inode_t	*ip,		/* incore inode pointer */
> -	struct xfs_iext_cursor *cur,
> -	xfs_extnum_t	count,		/* number of inserted items */
> -	xfs_bmbt_irec_t	*new,		/* items to insert */
> -	int		state)		/* type of extent conversion */
> -{
> -	xfs_ifork_t	*ifp = xfs_iext_state_to_fork(ip, state);
> -	xfs_extnum_t	i;		/* extent record index */
> -
> -	trace_xfs_iext_insert(ip, cur->idx, new, state, _RET_IP_);
> -
> -	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
> -	xfs_iext_add(ifp, cur->idx, count);
> -	for (i = 0; i < count; i++, new++)
> -		xfs_bmbt_set_all(xfs_iext_get_ext(ifp, cur->idx + i), new);
> -}
> -
> -/*
> - * This is called when the amount of space required for incore file
> - * extents needs to be increased. The ext_diff parameter stores the
> - * number of new extents being added and the idx parameter contains
> - * the extent index where the new extents will be added. If the new
> - * extents are being appended, then we just need to (re)allocate and
> - * initialize the space. Otherwise, if the new extents are being
> - * inserted into the middle of the existing entries, a bit more work
> - * is required to make room for the new extents to be inserted. The
> - * caller is responsible for filling in the new extent entries upon
> - * return.
> - */
> -void
> -xfs_iext_add(
> -	xfs_ifork_t	*ifp,		/* inode fork pointer */
> -	xfs_extnum_t	idx,		/* index to begin adding exts */
> -	int		ext_diff)	/* number of extents to add */
> -{
> -	int		byte_diff;	/* new bytes being added */
> -	int		new_size;	/* size of extents after adding */
> -	xfs_extnum_t	nextents;	/* number of extents in file */
> -
> -	nextents = xfs_iext_count(ifp);
> -	ASSERT((idx >= 0) && (idx <= nextents));
> -	byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
> -	new_size = ifp->if_bytes + byte_diff;
> -
> -	/*
> -	 * Use a linear (direct) extent list.
> -	 * If the extents are currently inside the inode,
> -	 * xfs_iext_realloc_direct will switch us from
> -	 * inline to direct extent allocation mode.
> -	 */
> -	if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
> -		xfs_iext_realloc_direct(ifp, new_size);
> -		if (idx < nextents) {
> -			memmove(&ifp->if_u1.if_extents[idx + ext_diff],
> -				&ifp->if_u1.if_extents[idx],
> -				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
> -			memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
> -		}
> -	}
> -	/* Indirection array */
> -	else {
> -		xfs_ext_irec_t	*erp;
> -		int		erp_idx = 0;
> -		int		page_idx = idx;
> -
> -		ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
> -		if (ifp->if_flags & XFS_IFEXTIREC) {
> -			erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
> -		} else {
> -			xfs_iext_irec_init(ifp);
> -			ASSERT(ifp->if_flags & XFS_IFEXTIREC);
> -			erp = ifp->if_u1.if_ext_irec;
> -		}
> -		/* Extents fit in target extent page */
> -		if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
> -			if (page_idx < erp->er_extcount) {
> -				memmove(&erp->er_extbuf[page_idx + ext_diff],
> -					&erp->er_extbuf[page_idx],
> -					(erp->er_extcount - page_idx) *
> -					sizeof(xfs_bmbt_rec_t));
> -				memset(&erp->er_extbuf[page_idx], 0, byte_diff);
> -			}
> -			erp->er_extcount += ext_diff;
> -			xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
> -		}
> -		/* Insert a new extent page */
> -		else if (erp) {
> -			xfs_iext_add_indirect_multi(ifp,
> -				erp_idx, page_idx, ext_diff);
> -		}
> -		/*
> -		 * If extent(s) are being appended to the last page in
> -		 * the indirection array and the new extent(s) don't fit
> -		 * in the page, then erp is NULL and erp_idx is set to
> -		 * the next index needed in the indirection array.
> -		 */
> -		else {
> -			uint	count = ext_diff;
> -
> -			while (count) {
> -				erp = xfs_iext_irec_new(ifp, erp_idx);
> -				erp->er_extcount = min(count, XFS_LINEAR_EXTS);
> -				count -= erp->er_extcount;
> -				if (count)
> -					erp_idx++;
> -			}
> -		}
> -	}
> -	ifp->if_bytes = new_size;
> -}
> -
> -/*
> - * This is called when incore extents are being added to the indirection
> - * array and the new extents do not fit in the target extent list. The
> - * erp_idx parameter contains the irec index for the target extent list
> - * in the indirection array, and the idx parameter contains the extent
> - * index within the list. The number of extents being added is stored
> - * in the count parameter.
> - *
> - *    |-------|   |-------|
> - *    |       |   |       |    idx - number of extents before idx
> - *    |  idx  |   | count |
> - *    |       |   |       |    count - number of extents being inserted at idx
> - *    |-------|   |-------|
> - *    | count |   | nex2  |    nex2 - number of extents after idx + count
> - *    |-------|   |-------|
> - */
> -void
> -xfs_iext_add_indirect_multi(
> -	xfs_ifork_t	*ifp,			/* inode fork pointer */
> -	int		erp_idx,		/* target extent irec index */
> -	xfs_extnum_t	idx,			/* index within target list */
> -	int		count)			/* new extents being added */
> -{
> -	int		byte_diff;		/* new bytes being added */
> -	xfs_ext_irec_t	*erp;			/* pointer to irec entry */
> -	xfs_extnum_t	ext_diff;		/* number of extents to add */
> -	xfs_extnum_t	ext_cnt;		/* new extents still needed */
> -	xfs_extnum_t	nex2;			/* extents after idx + count */
> -	xfs_bmbt_rec_t	*nex2_ep = NULL;	/* temp list for nex2 extents */
> -	int		nlists;			/* number of irec's (lists) */
> -
> -	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
> -	erp = &ifp->if_u1.if_ext_irec[erp_idx];
> -	nex2 = erp->er_extcount - idx;
> -	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
> -
> -	/*
> -	 * Save second part of target extent list
> -	 * (all extents past */
> -	if (nex2) {
> -		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
> -		nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
> -		memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
> -		erp->er_extcount -= nex2;
> -		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
> -		memset(&erp->er_extbuf[idx], 0, byte_diff);
> -	}
> -
> -	/*
> -	 * Add the new extents to the end of the target
> -	 * list, then allocate new irec record(s) and
> -	 * extent buffer(s) as needed to store the rest
> -	 * of the new extents.
> -	 */
> -	ext_cnt = count;
> -	ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
> -	if (ext_diff) {
> -		erp->er_extcount += ext_diff;
> -		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
> -		ext_cnt -= ext_diff;
> -	}
> -	while (ext_cnt) {
> -		erp_idx++;
> -		erp = xfs_iext_irec_new(ifp, erp_idx);
> -		ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
> -		erp->er_extcount = ext_diff;
> -		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
> -		ext_cnt -= ext_diff;
> -	}
> -
> -	/* Add nex2 extents back to indirection array */
> -	if (nex2) {
> -		xfs_extnum_t	ext_avail;
> -		int		i;
> -
> -		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
> -		ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
> -		i = 0;
> -		/*
> -		 * If nex2 extents fit in the current page, append
> -		 * nex2_ep after the new extents.
> -		 */
> -		if (nex2 <= ext_avail) {
> -			i = erp->er_extcount;
> -		}
> -		/*
> -		 * Otherwise, check if space is available in the
> -		 * next page.
> -		 */
> -		else if ((erp_idx < nlists - 1) &&
> -			 (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
> -			  ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
> -			erp_idx++;
> -			erp++;
> -			/* Create a hole for nex2 extents */
> -			memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
> -				erp->er_extcount * sizeof(xfs_bmbt_rec_t));
> -		}
> -		/*
> -		 * Final choice, create a new extent page for
> -		 * nex2 extents.
> -		 */
> -		else {
> -			erp_idx++;
> -			erp = xfs_iext_irec_new(ifp, erp_idx);
> -		}
> -		memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
> -		kmem_free(nex2_ep);
> -		erp->er_extcount += nex2;
> -		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
> -	}
> -}
> -
> -/*
> - * This is called when the amount of space required for incore file
> - * extents needs to be decreased. The ext_diff parameter stores the
> - * number of extents to be removed and the idx parameter contains
> - * the extent index where the extents will be removed from.
> - *
> - * If the amount of space needed has decreased below the linear
> - * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
> - * extent array.  Otherwise, use kmem_realloc() to adjust the
> - * size to what is needed.
> - */
> -void
> -xfs_iext_remove(
> -	xfs_inode_t	*ip,		/* incore inode pointer */
> -	struct xfs_iext_cursor *cur,
> -	int		ext_diff,	/* number of extents to remove */
> -	int		state)		/* type of extent conversion */
> -{
> -	xfs_ifork_t	*ifp = xfs_iext_state_to_fork(ip, state);
> -	xfs_extnum_t	nextents;	/* number of extents in file */
> -	int		new_size;	/* size of extents after removal */
> -
> -	trace_xfs_iext_remove(ip, cur, state, _RET_IP_);
> -
> -	ASSERT(ext_diff > 0);
> -	nextents = xfs_iext_count(ifp);
> -	new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
> -
> -	if (new_size == 0) {
> -		xfs_iext_destroy(ifp);
> -	} else if (ifp->if_flags & XFS_IFEXTIREC) {
> -		xfs_iext_remove_indirect(ifp, cur->idx, ext_diff);
> -	} else if (ifp->if_real_bytes) {
> -		xfs_iext_remove_direct(ifp, cur->idx, ext_diff);
> -	}
> -	ifp->if_bytes = new_size;
> -}
> -
> -/*
> - * This removes ext_diff extents from a linear (direct) extent list,
> - * beginning at extent index idx. If the extents are being removed
> - * from the end of the list (ie. truncate) then we just need to re-
> - * allocate the list to remove the extra space. Otherwise, if the
> - * extents are being removed from the middle of the existing extent
> - * entries, then we first need to move the extent records beginning
> - * at idx + ext_diff up in the list to overwrite the records being
> - * removed, then remove the extra space via kmem_realloc.
> - */
> -void
> -xfs_iext_remove_direct(
> -	xfs_ifork_t	*ifp,		/* inode fork pointer */
> -	xfs_extnum_t	idx,		/* index to begin removing exts */
> -	int		ext_diff)	/* number of extents to remove */
> -{
> -	xfs_extnum_t	nextents;	/* number of extents in file */
> -	int		new_size;	/* size of extents after removal */
> -
> -	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
> -	new_size = ifp->if_bytes -
> -		(ext_diff * sizeof(xfs_bmbt_rec_t));
> -	nextents = xfs_iext_count(ifp);
> -
> -	if (new_size == 0) {
> -		xfs_iext_destroy(ifp);
> -		return;
> -	}
> -	/* Move extents up in the list (if needed) */
> -	if (idx + ext_diff < nextents) {
> -		memmove(&ifp->if_u1.if_extents[idx],
> -			&ifp->if_u1.if_extents[idx + ext_diff],
> -			(nextents - (idx + ext_diff)) *
> -			 sizeof(xfs_bmbt_rec_t));
> -	}
> -	memset(&ifp->if_u1.if_extents[nextents - ext_diff],
> -		0, ext_diff * sizeof(xfs_bmbt_rec_t));
> -	/*
> -	 * Reallocate the direct extent list. If the extents
> -	 * will fit inside the inode then xfs_iext_realloc_direct
> -	 * will switch from direct to inline extent allocation
> -	 * mode for us.
> -	 */
> -	xfs_iext_realloc_direct(ifp, new_size);
> -	ifp->if_bytes = new_size;
> -}
> -
> -/*
> - * This is called when incore extents are being removed from the
> - * indirection array and the extents being removed span multiple extent
> - * buffers. The idx parameter contains the file extent index where we
> - * want to begin removing extents, and the count parameter contains
> - * how many extents need to be removed.
> - *
> - *    |-------|   |-------|
> - *    | nex1  |   |       |    nex1 - number of extents before idx
> - *    |-------|   | count |
> - *    |       |   |       |    count - number of extents being removed at idx
> - *    | count |   |-------|
> - *    |       |   | nex2  |    nex2 - number of extents after idx + count
> - *    |-------|   |-------|
> - */
> -void
> -xfs_iext_remove_indirect(
> -	xfs_ifork_t	*ifp,		/* inode fork pointer */
> -	xfs_extnum_t	idx,		/* index to begin removing extents */
> -	int		count)		/* number of extents to remove */
> -{
> -	xfs_ext_irec_t	*erp;		/* indirection array pointer */
> -	int		erp_idx = 0;	/* indirection array index */
> -	xfs_extnum_t	ext_cnt;	/* extents left to remove */
> -	xfs_extnum_t	ext_diff;	/* extents to remove in current list */
> -	xfs_extnum_t	nex1;		/* number of extents before idx */
> -	xfs_extnum_t	nex2;		/* extents after idx + count */
> -	int		page_idx = idx;	/* index in target extent list */
> -
> -	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
> -	erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
> -	ASSERT(erp != NULL);
> -	nex1 = page_idx;
> -	ext_cnt = count;
> -	while (ext_cnt) {
> -		nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
> -		ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
> -		/*
> -		 * Check for deletion of entire list;
> -		 * xfs_iext_irec_remove() updates extent offsets.
> -		 */
> -		if (ext_diff == erp->er_extcount) {
> -			xfs_iext_irec_remove(ifp, erp_idx);
> -			ext_cnt -= ext_diff;
> -			nex1 = 0;
> -			if (ext_cnt) {
> -				ASSERT(erp_idx < ifp->if_real_bytes /
> -					XFS_IEXT_BUFSZ);
> -				erp = &ifp->if_u1.if_ext_irec[erp_idx];
> -				nex1 = 0;
> -				continue;
> -			} else {
> -				break;
> -			}
> -		}
> -		/* Move extents up (if needed) */
> -		if (nex2) {
> -			memmove(&erp->er_extbuf[nex1],
> -				&erp->er_extbuf[nex1 + ext_diff],
> -				nex2 * sizeof(xfs_bmbt_rec_t));
> -		}
> -		/* Zero out rest of page */
> -		memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
> -			((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
> -		/* Update remaining counters */
> -		erp->er_extcount -= ext_diff;
> -		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
> -		ext_cnt -= ext_diff;
> -		nex1 = 0;
> -		erp_idx++;
> -		erp++;
> -	}
> -	ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
> -	xfs_iext_irec_compact(ifp);
> -}
> -
> -/*
> - * Create, destroy, or resize a linear (direct) block of extents.
> - */
> -void
> -xfs_iext_realloc_direct(
> -	xfs_ifork_t	*ifp,		/* inode fork pointer */
> -	int		new_size)	/* new size of extents after adding */
> -{
> -	int		rnew_size;	/* real new size of extents */
> -
> -	rnew_size = new_size;
> -
> -	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
> -		((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
> -		 (new_size != ifp->if_real_bytes)));
> -
> -	/* Free extent records */
> -	if (new_size == 0) {
> -		xfs_iext_destroy(ifp);
> -	} else {
> -		if (!is_power_of_2(new_size)){
> -			rnew_size = roundup_pow_of_two(new_size);
> -		}
> -		if (rnew_size != ifp->if_real_bytes) {
> -			ifp->if_u1.if_extents =
> -				kmem_realloc(ifp->if_u1.if_extents,
> -						rnew_size, KM_NOFS);
> -		}
> -		if (rnew_size > ifp->if_real_bytes) {
> -			memset(&ifp->if_u1.if_extents[ifp->if_bytes /
> -				(uint)sizeof(xfs_bmbt_rec_t)], 0,
> -				rnew_size - ifp->if_real_bytes);
> -		}
> -	}
> -	ifp->if_real_bytes = rnew_size;
> -	ifp->if_bytes = new_size;
> -}
> -
> -/*
> - * Resize an extent indirection array to new_size bytes.
> - */
> -STATIC void
> -xfs_iext_realloc_indirect(
> -	xfs_ifork_t	*ifp,		/* inode fork pointer */
> -	int		new_size)	/* new indirection array size */
> -{
> -	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
> -	ASSERT(ifp->if_real_bytes);
> -	ASSERT((new_size >= 0) &&
> -	       (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) *
> -			     sizeof(xfs_ext_irec_t))));
> -	if (new_size == 0) {
> -		xfs_iext_destroy(ifp);
> -	} else {
> -		ifp->if_u1.if_ext_irec =
> -			kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS);
> -	}
> -}
> -
> -/*
> - * Switch from indirection array to linear (direct) extent allocations.
> - */
> -STATIC void
> -xfs_iext_indirect_to_direct(
> -	 xfs_ifork_t	*ifp)		/* inode fork pointer */
> -{
> -	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
> -	xfs_extnum_t	nextents;	/* number of extents in file */
> -	int		size;		/* size of file extents */
> -
> -	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
> -	nextents = xfs_iext_count(ifp);
> -	ASSERT(nextents <= XFS_LINEAR_EXTS);
> -	size = nextents * sizeof(xfs_bmbt_rec_t);
> -
> -	xfs_iext_irec_compact_pages(ifp);
> -	ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
> -
> -	ep = ifp->if_u1.if_ext_irec->er_extbuf;
> -	kmem_free(ifp->if_u1.if_ext_irec);
> -	ifp->if_flags &= ~XFS_IFEXTIREC;
> -	ifp->if_u1.if_extents = ep;
> -	ifp->if_bytes = size;
> -	if (nextents < XFS_LINEAR_EXTS) {
> -		xfs_iext_realloc_direct(ifp, size);
> -	}
> -}
> -
> -/*
> - * Remove all records from the indirection array.
> - */
> -STATIC void
> -xfs_iext_irec_remove_all(
> -	struct xfs_ifork *ifp)
> -{
> -	int		nlists;
> -	int		i;
> -
> -	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
> -	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
> -	for (i = 0; i < nlists; i++)
> -		kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf);
> -	kmem_free(ifp->if_u1.if_ext_irec);
> -	ifp->if_flags &= ~XFS_IFEXTIREC;
> -}
> -
> -/*
> - * Free incore file extents.
> - */
> -void
> -xfs_iext_destroy(
> -	xfs_ifork_t	*ifp)		/* inode fork pointer */
> -{
> -	if (ifp->if_flags & XFS_IFEXTIREC) {
> -		xfs_iext_irec_remove_all(ifp);
> -	} else if (ifp->if_real_bytes) {
> -		kmem_free(ifp->if_u1.if_extents);
> -	}
> -	ifp->if_u1.if_extents = NULL;
> -	ifp->if_real_bytes = 0;
> -	ifp->if_bytes = 0;
> -}
> -
> -/*
> - * Return a pointer to the extent record for file system block bno.
> - */
> -xfs_bmbt_rec_host_t *			/* pointer to found extent record */
> -xfs_iext_bno_to_ext(
> -	xfs_ifork_t	*ifp,		/* inode fork pointer */
> -	xfs_fileoff_t	bno,		/* block number to search for */
> -	xfs_extnum_t	*idxp)		/* index of target extent */
> -{
> -	xfs_bmbt_rec_host_t *base;	/* pointer to first extent */
> -	xfs_filblks_t	blockcount = 0;	/* number of blocks in extent */
> -	xfs_bmbt_rec_host_t *ep = NULL;	/* pointer to target extent */
> -	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
> -	int		high;		/* upper boundary in search */
> -	xfs_extnum_t	idx = 0;	/* index of target extent */
> -	int		low;		/* lower boundary in search */
> -	xfs_extnum_t	nextents;	/* number of file extents */
> -	xfs_fileoff_t	startoff = 0;	/* start offset of extent */
> -
> -	nextents = xfs_iext_count(ifp);
> -	if (nextents == 0) {
> -		*idxp = 0;
> -		return NULL;
> -	}
> -	low = 0;
> -	if (ifp->if_flags & XFS_IFEXTIREC) {
> -		/* Find target extent list */
> -		int	erp_idx = 0;
> -		erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
> -		base = erp->er_extbuf;
> -		high = erp->er_extcount - 1;
> -	} else {
> -		base = ifp->if_u1.if_extents;
> -		high = nextents - 1;
> -	}
> -	/* Binary search extent records */
> -	while (low <= high) {
> -		idx = (low + high) >> 1;
> -		ep = base + idx;
> -		startoff = xfs_bmbt_get_startoff(ep);
> -		blockcount = xfs_bmbt_get_blockcount(ep);
> -		if (bno < startoff) {
> -			high = idx - 1;
> -		} else if (bno >= startoff + blockcount) {
> -			low = idx + 1;
> -		} else {
> -			/* Convert back to file-based extent index */
> -			if (ifp->if_flags & XFS_IFEXTIREC) {
> -				idx += erp->er_extoff;
> -			}
> -			*idxp = idx;
> -			return ep;
> -		}
> -	}
> -	/* Convert back to file-based extent index */
> -	if (ifp->if_flags & XFS_IFEXTIREC) {
> -		idx += erp->er_extoff;
> -	}
> -	if (bno >= startoff + blockcount) {
> -		if (++idx == nextents) {
> -			ep = NULL;
> -		} else {
> -			ep = xfs_iext_get_ext(ifp, idx);
> -		}
> -	}
> -	*idxp = idx;
> -	return ep;
> -}
> -
> -/*
> - * Return a pointer to the indirection array entry containing the
> - * extent record for filesystem block bno. Store the index of the
> - * target irec in *erp_idxp.
> - */
> -xfs_ext_irec_t *			/* pointer to found extent record */
> -xfs_iext_bno_to_irec(
> -	xfs_ifork_t	*ifp,		/* inode fork pointer */
> -	xfs_fileoff_t	bno,		/* block number to search for */
> -	int		*erp_idxp)	/* irec index of target ext list */
> -{
> -	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
> -	xfs_ext_irec_t	*erp_next;	/* next indirection array entry */
> -	int		erp_idx;	/* indirection array index */
> -	int		nlists;		/* number of extent irec's (lists) */
> -	int		high;		/* binary search upper limit */
> -	int		low;		/* binary search lower limit */
> -
> -	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
> -	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
> -	erp_idx = 0;
> -	low = 0;
> -	high = nlists - 1;
> -	while (low <= high) {
> -		erp_idx = (low + high) >> 1;
> -		erp = &ifp->if_u1.if_ext_irec[erp_idx];
> -		erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
> -		if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
> -			high = erp_idx - 1;
> -		} else if (erp_next && bno >=
> -			   xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
> -			low = erp_idx + 1;
> -		} else {
> -			break;
> -		}
> -	}
> -	*erp_idxp = erp_idx;
> -	return erp;
> -}
> -
> -/*
> - * Return a pointer to the indirection array entry containing the
> - * extent record at file extent index *idxp. Store the index of the
> - * target irec in *erp_idxp and store the page index of the target
> - * extent record in *idxp.
> - */
> -xfs_ext_irec_t *
> -xfs_iext_idx_to_irec(
> -	xfs_ifork_t	*ifp,		/* inode fork pointer */
> -	xfs_extnum_t	*idxp,		/* extent index (file -> page) */
> -	int		*erp_idxp,	/* pointer to target irec */
> -	int		realloc)	/* new bytes were just added */
> -{
> -	xfs_ext_irec_t	*prev;		/* pointer to previous irec */
> -	xfs_ext_irec_t	*erp = NULL;	/* pointer to current irec */
> -	int		erp_idx;	/* indirection array index */
> -	int		nlists;		/* number of irec's (ex lists) */
> -	int		high;		/* binary search upper limit */
> -	int		low;		/* binary search lower limit */
> -	xfs_extnum_t	page_idx = *idxp; /* extent index in target list */
> -
> -	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
> -	ASSERT(page_idx >= 0);
> -	ASSERT(page_idx <= xfs_iext_count(ifp));
> -	ASSERT(page_idx < xfs_iext_count(ifp) || realloc);
> -
> -	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
> -	erp_idx = 0;
> -	low = 0;
> -	high = nlists - 1;
> -
> -	/* Binary search extent irec's */
> -	while (low <= high) {
> -		erp_idx = (low + high) >> 1;
> -		erp = &ifp->if_u1.if_ext_irec[erp_idx];
> -		prev = erp_idx > 0 ? erp - 1 : NULL;
> -		if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
> -		     realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
> -			high = erp_idx - 1;
> -		} else if (page_idx > erp->er_extoff + erp->er_extcount ||
> -			   (page_idx == erp->er_extoff + erp->er_extcount &&
> -			    !realloc)) {
> -			low = erp_idx + 1;
> -		} else if (page_idx == erp->er_extoff + erp->er_extcount &&
> -			   erp->er_extcount == XFS_LINEAR_EXTS) {
> -			ASSERT(realloc);
> -			page_idx = 0;
> -			erp_idx++;
> -			erp = erp_idx < nlists ? erp + 1 : NULL;
> -			break;
> -		} else {
> -			page_idx -= erp->er_extoff;
> -			break;
> -		}
> -	}
> -	*idxp = page_idx;
> -	*erp_idxp = erp_idx;
> -	return erp;
> -}
> -
> -/*
> - * Allocate and initialize an indirection array once the space needed
> - * for incore extents increases above XFS_IEXT_BUFSZ.
> - */
> -void
> -xfs_iext_irec_init(
> -	xfs_ifork_t	*ifp)		/* inode fork pointer */
> -{
> -	xfs_ext_irec_t	*erp;		/* indirection array pointer */
> -	xfs_extnum_t	nextents;	/* number of extents in file */
> -
> -	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
> -	nextents = xfs_iext_count(ifp);
> -	ASSERT(nextents <= XFS_LINEAR_EXTS);
> -
> -	erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
> -
> -	if (nextents == 0) {
> -		ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
> -	} else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
> -		xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
> -	}
> -	erp->er_extbuf = ifp->if_u1.if_extents;
> -	erp->er_extcount = nextents;
> -	erp->er_extoff = 0;
> -
> -	ifp->if_flags |= XFS_IFEXTIREC;
> -	ifp->if_real_bytes = XFS_IEXT_BUFSZ;
> -	ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
> -	ifp->if_u1.if_ext_irec = erp;
> -
> -	return;
> -}
> -
> -/*
> - * Allocate and initialize a new entry in the indirection array.
> - */
> -xfs_ext_irec_t *
> -xfs_iext_irec_new(
> -	xfs_ifork_t	*ifp,		/* inode fork pointer */
> -	int		erp_idx)	/* index for new irec */
> -{
> -	xfs_ext_irec_t	*erp;		/* indirection array pointer */
> -	int		i;		/* loop counter */
> -	int		nlists;		/* number of irec's (ex lists) */
> -
> -	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
> -	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
> -
> -	/* Resize indirection array */
> -	xfs_iext_realloc_indirect(ifp, ++nlists *
> -				  sizeof(xfs_ext_irec_t));
> -	/*
> -	 * Move records down in the array so the
> -	 * new page can use erp_idx.
> -	 */
> -	erp = ifp->if_u1.if_ext_irec;
> -	for (i = nlists - 1; i > erp_idx; i--) {
> -		memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
> -	}
> -	ASSERT(i == erp_idx);
> -
> -	/* Initialize new extent record */
> -	erp = ifp->if_u1.if_ext_irec;
> -	erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
> -	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
> -	memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
> -	erp[erp_idx].er_extcount = 0;
> -	erp[erp_idx].er_extoff = erp_idx > 0 ?
> -		erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
> -	return (&erp[erp_idx]);
> -}
> -
> -/*
> - * Remove a record from the indirection array.
> - */
> -void
> -xfs_iext_irec_remove(
> -	xfs_ifork_t	*ifp,		/* inode fork pointer */
> -	int		erp_idx)	/* irec index to remove */
> -{
> -	xfs_ext_irec_t	*erp;		/* indirection array pointer */
> -	int		i;		/* loop counter */
> -	int		nlists;		/* number of irec's (ex lists) */
> -
> -	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
> -	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
> -	erp = &ifp->if_u1.if_ext_irec[erp_idx];
> -	if (erp->er_extbuf) {
> -		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
> -			-erp->er_extcount);
> -		kmem_free(erp->er_extbuf);
> -	}
> -	/* Compact extent records */
> -	erp = ifp->if_u1.if_ext_irec;
> -	for (i = erp_idx; i < nlists - 1; i++) {
> -		memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
> -	}
> -	/*
> -	 * Manually free the last extent record from the indirection
> -	 * array.  A call to xfs_iext_realloc_indirect() with a size
> -	 * of zero would result in a call to xfs_iext_destroy() which
> -	 * would in turn call this function again, creating a nasty
> -	 * infinite loop.
> -	 */
> -	if (--nlists) {
> -		xfs_iext_realloc_indirect(ifp,
> -			nlists * sizeof(xfs_ext_irec_t));
> -	} else {
> -		kmem_free(ifp->if_u1.if_ext_irec);
> -	}
> -	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
> -}
> -
> -/*
> - * This is called to clean up large amounts of unused memory allocated
> - * by the indirection array.  Before compacting anything though, verify
> - * that the indirection array is still needed and switch back to the
> - * linear extent list (or even the inline buffer) if possible.  The
> - * compaction policy is as follows:
> - *
> - *    Full Compaction: Extents fit into a single page (or inline buffer)
> - * Partial Compaction: Extents occupy less than 50% of allocated space
> - *      No Compaction: Extents occupy at least 50% of allocated space
> - */
> -void
> -xfs_iext_irec_compact(
> -	xfs_ifork_t	*ifp)		/* inode fork pointer */
> -{
> -	xfs_extnum_t	nextents;	/* number of extents in file */
> -	int		nlists;		/* number of irec's (ex lists) */
> -
> -	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
> -	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
> -	nextents = xfs_iext_count(ifp);
> -
> -	if (nextents == 0) {
> -		xfs_iext_destroy(ifp);
> -	} else if (nextents <= XFS_LINEAR_EXTS) {
> -		xfs_iext_indirect_to_direct(ifp);
> -	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
> -		xfs_iext_irec_compact_pages(ifp);
> -	}
> -}
> -
> -/*
> - * Combine extents from neighboring extent pages.
> - */
> -void
> -xfs_iext_irec_compact_pages(
> -	xfs_ifork_t	*ifp)		/* inode fork pointer */
> -{
> -	xfs_ext_irec_t	*erp, *erp_next;/* pointers to irec entries */
> -	int		erp_idx = 0;	/* indirection array index */
> -	int		nlists;		/* number of irec's (ex lists) */
> -
> -	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
> -	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
> -	while (erp_idx < nlists - 1) {
> -		erp = &ifp->if_u1.if_ext_irec[erp_idx];
> -		erp_next = erp + 1;
> -		if (erp_next->er_extcount <=
> -		    (XFS_LINEAR_EXTS - erp->er_extcount)) {
> -			memcpy(&erp->er_extbuf[erp->er_extcount],
> -				erp_next->er_extbuf, erp_next->er_extcount *
> -				sizeof(xfs_bmbt_rec_t));
> -			erp->er_extcount += erp_next->er_extcount;
> -			/*
> -			 * Free page before removing extent record
> -			 * so er_extoffs don't get modified in
> -			 * xfs_iext_irec_remove.
> -			 */
> -			kmem_free(erp_next->er_extbuf);
> -			erp_next->er_extbuf = NULL;
> -			xfs_iext_irec_remove(ifp, erp_idx + 1);
> -			nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
> -		} else {
> -			erp_idx++;
> -		}
> -	}
> -}
> -
> -/*
> - * This is called to update the er_extoff field in the indirection
> - * array when extents have been added or removed from one of the
> - * extent lists. erp_idx contains the irec index to begin updating
> - * at and ext_diff contains the number of extents that were added
> - * or removed.
> - */
> -void
> -xfs_iext_irec_update_extoffs(
> -	xfs_ifork_t	*ifp,		/* inode fork pointer */
> -	int		erp_idx,	/* irec index to update */
> -	int		ext_diff)	/* number of new extents */
> -{
> -	int		i;		/* loop counter */
> -	int		nlists;		/* number of irec's (ex lists */
> -
> -	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
> -	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
> -	for (i = erp_idx; i < nlists; i++) {
> -		ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
> -	}
> -}
> -
>  /*
>   * Initialize an inode's copy-on-write fork.
>   */
> @@ -1756,87 +831,3 @@ xfs_ifork_init_cow(
>  	ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
>  	ip->i_cnextents = 0;
>  }
> -
> -/*
> - * Lookup the extent covering bno.
> - *
> - * If there is an extent covering bno return the extent index, and store the
> - * expanded extent structure in *gotp, and the extent cursor in *cur.
> - * If there is no extent covering bno, but there is an extent after it (e.g.
> - * it lies in a hole) return that extent in *gotp and its cursor in *cur
> - * instead.
> - * If bno is beyond the last extent return false, and return an invalid
> - * cursor value.
> - */
> -bool
> -xfs_iext_lookup_extent(
> -	struct xfs_inode	*ip,
> -	struct xfs_ifork	*ifp,
> -	xfs_fileoff_t		bno,
> -	struct xfs_iext_cursor	*cur,
> -	struct xfs_bmbt_irec	*gotp)
> -{
> -	struct xfs_bmbt_rec_host *ep;
> -
> -	XFS_STATS_INC(ip->i_mount, xs_look_exlist);
> -
> -	ep = xfs_iext_bno_to_ext(ifp, bno, &cur->idx);
> -	if (!ep)
> -		return false;
> -	xfs_bmbt_get_all(ep, gotp);
> -	return true;
> -}
> -
> -/*
> - * Returns the last extent before end, and if this extent doesn't cover
> - * end, update end to the end of the extent.
> - */
> -bool
> -xfs_iext_lookup_extent_before(
> -	struct xfs_inode	*ip,
> -	struct xfs_ifork	*ifp,
> -	xfs_fileoff_t		*end,
> -	struct xfs_iext_cursor	*cur,
> -	struct xfs_bmbt_irec	*gotp)
> -{
> -	if (xfs_iext_lookup_extent(ip, ifp, *end - 1, cur, gotp) &&
> -	    gotp->br_startoff <= *end - 1)
> -		return true;
> -	if (!xfs_iext_prev_extent(ifp, cur, gotp))
> -		return false;
> -	*end = gotp->br_startoff + gotp->br_blockcount;
> -	return true;
> -}
> -
> -/*
> - * Return true if there is an extent at cursor cur and return the expanded
> - * extent structure at cur in gotp in that case.  Else return false.
> - */
> -bool
> -xfs_iext_get_extent(
> -	struct xfs_ifork	*ifp,
> -	struct xfs_iext_cursor	*cur,
> -	struct xfs_bmbt_irec	*gotp)
> -{
> -	if (cur->idx < 0 || cur->idx >= xfs_iext_count(ifp))
> -		return false;
> -	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, cur->idx), gotp);
> -	return true;
> -}
> -
> -void
> -xfs_iext_update_extent(
> -	struct xfs_inode	*ip,
> -	int			state,
> -	struct xfs_iext_cursor	*cur,
> -	struct xfs_bmbt_irec	*gotp)
> -{
> -	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
> -
> -	ASSERT(cur->idx >= 0);
> -	ASSERT(cur->idx < xfs_iext_count(ifp));
> -
> -	trace_xfs_bmap_pre_update(ip, cur, state, _RET_IP_);
> -	xfs_bmbt_set_all(xfs_iext_get_ext(ifp, cur->idx), gotp);
> -	trace_xfs_bmap_post_update(ip, cur, state, _RET_IP_);
> -}
> diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
> index 508f13784334..015872caaab4 100644
> --- a/fs/xfs/libxfs/xfs_inode_fork.h
> +++ b/fs/xfs/libxfs/xfs_inode_fork.h
> @@ -21,45 +21,18 @@
>  struct xfs_inode_log_item;
>  struct xfs_dinode;
>  
> -/*
> - * The following xfs_ext_irec_t struct introduces a second (top) level
> - * to the in-core extent allocation scheme. These structs are allocated
> - * in a contiguous block, creating an indirection array where each entry
> - * (irec) contains a pointer to a buffer of in-core extent records which
> - * it manages. Each extent buffer is 4k in size, since 4k is the system
> - * page size on Linux i386 and systems with larger page sizes don't seem
> - * to gain much, if anything, by using their native page size as the
> - * extent buffer size. Also, using 4k extent buffers everywhere provides
> - * a consistent interface for CXFS across different platforms.
> - *
> - * There is currently no limit on the number of irec's (extent lists)
> - * allowed, so heavily fragmented files may require an indirection array
> - * which spans multiple system pages of memory. The number of extents
> - * which would require this amount of contiguous memory is very large
> - * and should not cause problems in the foreseeable future. However,
> - * if the memory needed for the contiguous array ever becomes a problem,
> - * it is possible that a third level of indirection may be required.
> - */
> -typedef struct xfs_ext_irec {
> -	xfs_bmbt_rec_host_t *er_extbuf;	/* block of extent records */
> -	xfs_extnum_t	er_extoff;	/* extent offset in file */
> -	xfs_extnum_t	er_extcount;	/* number of extents in page/block */
> -} xfs_ext_irec_t;
> -
>  /*
>   * File incore extent information, present for each of data & attr forks.
>   */
> -#define	XFS_IEXT_BUFSZ		4096
> -#define	XFS_LINEAR_EXTS		(XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
>  typedef struct xfs_ifork {
>  	int			if_bytes;	/* bytes in if_u1 */
>  	int			if_real_bytes;	/* bytes allocated in if_u1 */
>  	struct xfs_btree_block	*if_broot;	/* file's incore btree root */
>  	short			if_broot_bytes;	/* bytes allocated for root */
>  	unsigned char		if_flags;	/* per-fork flags */
> +	int			if_height;	/* height of the extent tree */
>  	union {
> -		xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
> -		xfs_ext_irec_t	*if_ext_irec;	/* irec map file exts */
> +		void		*if_root;	/* extent tree root */
>  		char		*if_data;	/* inline file data */
>  	} if_u1;
>  } xfs_ifork_t;
> @@ -70,7 +43,6 @@ typedef struct xfs_ifork {
>  #define	XFS_IFINLINE	0x01	/* Inline data is read in */
>  #define	XFS_IFEXTENTS	0x02	/* All extent pointers are read in */
>  #define	XFS_IFBROOT	0x04	/* i_broot points to the bmap b-tree root */
> -#define	XFS_IFEXTIREC	0x08	/* Indirection array of extent blocks */
>  
>  /*
>   * Fork handling.
> @@ -140,35 +112,12 @@ int		xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
>  				  int);
>  void		xfs_init_local_fork(struct xfs_inode *, int, const void *, int);
>  
> -struct xfs_bmbt_rec_host *
> -		xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
> -xfs_extnum_t	xfs_iext_count(struct xfs_ifork *);
> +xfs_extnum_t	xfs_iext_count(struct xfs_ifork *ifp);
>  void		xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur,
>  			xfs_extnum_t, struct xfs_bmbt_irec *, int);
> -void		xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
> -void		xfs_iext_add_indirect_multi(struct xfs_ifork *, int,
> -					    xfs_extnum_t, int);
>  void		xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *,
>  			int, int);
> -void		xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int);
> -void		xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int);
> -void		xfs_iext_realloc_direct(struct xfs_ifork *, int);
>  void		xfs_iext_destroy(struct xfs_ifork *);
> -struct xfs_bmbt_rec_host *
> -		xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *);
> -struct xfs_ext_irec *
> -		xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *);
> -struct xfs_ext_irec *
> -		xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *,
> -				     int);
> -void		xfs_iext_irec_init(struct xfs_ifork *);
> -struct xfs_ext_irec *
> -		xfs_iext_irec_new(struct xfs_ifork *, int);
> -void		xfs_iext_irec_remove(struct xfs_ifork *, int);
> -void		xfs_iext_irec_compact(struct xfs_ifork *);
> -void		xfs_iext_irec_compact_pages(struct xfs_ifork *);
> -void		xfs_iext_irec_compact_full(struct xfs_ifork *);
> -void		xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
>  
>  bool		xfs_iext_lookup_extent(struct xfs_inode *ip,
>  			struct xfs_ifork *ifp, xfs_fileoff_t bno,
> @@ -185,29 +134,10 @@ void		xfs_iext_update_extent(struct xfs_inode *ip, int state,
>  			struct xfs_iext_cursor *cur,
>  			struct xfs_bmbt_irec *gotp);
>  
> -static inline void xfs_iext_first(struct xfs_ifork *ifp,
> -		struct xfs_iext_cursor *cur)
> -{
> -	cur->idx = 0;
> -}
> -
> -static inline void xfs_iext_last(struct xfs_ifork *ifp,
> -		struct xfs_iext_cursor *cur)
> -{
> -	cur->idx = xfs_iext_count(ifp) - 1;
> -}
> -
> -static inline void xfs_iext_next(struct xfs_ifork *ifp,
> -		struct xfs_iext_cursor *cur)
> -{
> -	cur->idx++;
> -}
> -
> -static inline void xfs_iext_prev(struct xfs_ifork *ifp,
> -		struct xfs_iext_cursor *cur)
> -{
> -	cur->idx--;
> -}
> +void		xfs_iext_first(struct xfs_ifork *, struct xfs_iext_cursor *);
> +void		xfs_iext_last(struct xfs_ifork *, struct xfs_iext_cursor *);
> +void		xfs_iext_next(struct xfs_ifork *, struct xfs_iext_cursor *);
> +void		xfs_iext_prev(struct xfs_ifork *, struct xfs_iext_cursor *);
>  
>  static inline bool xfs_iext_next_extent(struct xfs_ifork *ifp,
>  		struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
> diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
> index 5da6382bdaf1..983878019097 100644
> --- a/fs/xfs/libxfs/xfs_types.h
> +++ b/fs/xfs/libxfs/xfs_types.h
> @@ -143,7 +143,8 @@ typedef uint32_t	xfs_dqid_t;
>  #define	XFS_WORDMASK	((1 << XFS_WORDLOG) - 1)
>  
>  struct xfs_iext_cursor {
> -	xfs_extnum_t		idx;
> +	struct xfs_iext_leaf	*leaf;
> +	int			pos;
>  };
>  
>  #endif	/* __XFS_TYPES_H__ */
> diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
> index 778b709dbd0c..72d753d898d3 100644
> --- a/fs/xfs/scrub/bmap.c
> +++ b/fs/xfs/scrub/bmap.c
> @@ -168,7 +168,6 @@ xfs_scrub_bmapbt_rec(
>  	struct xfs_scrub_btree		*bs,
>  	union xfs_btree_rec		*rec)
>  {
> -	struct xfs_bmbt_rec_host	ihost;
>  	struct xfs_bmbt_irec		irec;
>  	struct xfs_scrub_bmap_info	*info = bs->private;
>  	struct xfs_inode		*ip = bs->cur->bc_private.b.ip;
> @@ -193,9 +192,7 @@ xfs_scrub_bmapbt_rec(
>  	}
>  
>  	/* Set up the in-core record and scrub it. */
> -	ihost.l0 = be64_to_cpu(rec->bmbt.l0);
> -	ihost.l1 = be64_to_cpu(rec->bmbt.l1);
> -	xfs_bmbt_get_all(&ihost, &irec);
> +	xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
>  	return xfs_scrub_bmap_extent(ip, bs->cur, info, &irec);
>  }
>  
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index a929ca72fa8e..5b5128ed0f18 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -933,7 +933,7 @@ xfs_ialloc(
>  		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
>  		ip->i_df.if_flags = XFS_IFEXTENTS;
>  		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
> -		ip->i_df.if_u1.if_extents = NULL;
> +		ip->i_df.if_u1.if_root = NULL;
>  		break;
>  	default:
>  		ASSERT(0);
> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> index eb6f4f7c9520..6ee5c3bf19ad 100644
> --- a/fs/xfs/xfs_inode_item.c
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -162,7 +162,6 @@ xfs_inode_item_format_data_fork(
>  		    ip->i_df.if_bytes > 0) {
>  			struct xfs_bmbt_rec *p;
>  
> -			ASSERT(ip->i_df.if_u1.if_extents != NULL);
>  			ASSERT(xfs_iext_count(&ip->i_df) > 0);
>  
>  			p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
> @@ -252,7 +251,6 @@ xfs_inode_item_format_attr_fork(
>  
>  			ASSERT(xfs_iext_count(ip->i_afp) ==
>  				ip->i_d.di_anextents);
> -			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
>  
>  			p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
>  			data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 667bfce802cd..515ba042d75c 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -218,45 +218,6 @@ TRACE_EVENT(xfs_attr_list_node_descend,
>  		   __entry->bt_before)
>  );
>  
> -TRACE_EVENT(xfs_iext_insert,
> -	TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx,
> -		 struct xfs_bmbt_irec *r, int state, unsigned long caller_ip),
> -	TP_ARGS(ip, idx, r, state, caller_ip),
> -	TP_STRUCT__entry(
> -		__field(dev_t, dev)
> -		__field(xfs_ino_t, ino)
> -		__field(xfs_extnum_t, idx)
> -		__field(xfs_fileoff_t, startoff)
> -		__field(xfs_fsblock_t, startblock)
> -		__field(xfs_filblks_t, blockcount)
> -		__field(xfs_exntst_t, state)
> -		__field(int, bmap_state)
> -		__field(unsigned long, caller_ip)
> -	),
> -	TP_fast_assign(
> -		__entry->dev = VFS_I(ip)->i_sb->s_dev;
> -		__entry->ino = ip->i_ino;
> -		__entry->idx = idx;
> -		__entry->startoff = r->br_startoff;
> -		__entry->startblock = r->br_startblock;
> -		__entry->blockcount = r->br_blockcount;
> -		__entry->state = r->br_state;
> -		__entry->bmap_state = state;
> -		__entry->caller_ip = caller_ip;
> -	),
> -	TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
> -		  "offset %lld block %lld count %lld flag %d caller %ps",
> -		  MAJOR(__entry->dev), MINOR(__entry->dev),
> -		  __entry->ino,
> -		  __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
> -		  (long)__entry->idx,
> -		  __entry->startoff,
> -		  (int64_t)__entry->startblock,
> -		  __entry->blockcount,
> -		  __entry->state,
> -		  (char *)__entry->caller_ip)
> -);
> -
>  DECLARE_EVENT_CLASS(xfs_bmap_class,
>  	TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state,
>  		 unsigned long caller_ip),
> @@ -264,7 +225,8 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
>  	TP_STRUCT__entry(
>  		__field(dev_t, dev)
>  		__field(xfs_ino_t, ino)
> -		__field(xfs_extnum_t, idx)
> +		__field(void *, leaf);
> +		__field(int, pos);
>  		__field(xfs_fileoff_t, startoff)
>  		__field(xfs_fsblock_t, startblock)
>  		__field(xfs_filblks_t, blockcount)
> @@ -280,7 +242,8 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
>  		xfs_iext_get_extent(ifp, cur, &r);
>  		__entry->dev = VFS_I(ip)->i_sb->s_dev;
>  		__entry->ino = ip->i_ino;
> -		__entry->idx = cur->idx;
> +		__entry->leaf = cur->leaf;
> +		__entry->pos = cur->pos;
>  		__entry->startoff = r.br_startoff;
>  		__entry->startblock = r.br_startblock;
>  		__entry->blockcount = r.br_blockcount;
> @@ -288,12 +251,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
>  		__entry->bmap_state = state;
>  		__entry->caller_ip = caller_ip;
>  	),
> -	TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
> +	TP_printk("dev %d:%d ino 0x%llx state %s cur 0x%p/%d "
>  		  "offset %lld block %lld count %lld flag %d caller %ps",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
>  		  __entry->ino,
>  		  __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
> -		  (long)__entry->idx,
> +		  __entry->leaf,
> +		  __entry->pos,
>  		  __entry->startoff,
>  		  (int64_t)__entry->startblock,
>  		  __entry->blockcount,
> @@ -306,6 +270,7 @@ DEFINE_EVENT(xfs_bmap_class, name, \
>  	TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state, \
>  		 unsigned long caller_ip), \
>  	TP_ARGS(ip, cur, state, caller_ip))
> +DEFINE_BMAP_EVENT(xfs_iext_insert);
>  DEFINE_BMAP_EVENT(xfs_iext_remove);
>  DEFINE_BMAP_EVENT(xfs_bmap_pre_update);
>  DEFINE_BMAP_EVENT(xfs_bmap_post_update);
> -- 
> 2.14.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux