On Mon, Jun 22, 2015 at 08:24:38PM -0700, mingming cao wrote: > --- > ext4_btree.c | 1356 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 1356 insertions(+) > create mode 100644 ext4_btree.c > > diff --git a/ext4_btree.c b/ext4_btree.c > new file mode 100644 > index 0000000..baf253c > --- /dev/null > +++ b/ext4_btree.c > @@ -0,0 +1,1356 @@ > +/* > + * copyright (C) 2015 Oracle. All rights reserved. (Nit-picking, but this should be a capital-C "Copyright") > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public > + * License v2 as published by the Free Software Foundation. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * General Public License for more details. > + * > + * You should have received a copy of the GNU General Public > + * License along with this program; if not, write to the > + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, > + * Boston, MA 021110-1307, USA. > + */ > + > + > + > + > +#include "ext4_btree.h" > + > +/* > + * Calculate offset of the n-th key in a btree node. > + */ > +static inline unsigned int > +ext4_btree_key_offset(struct ext4_btree_geo *geo, unsigned int n) > +{ > + return geo->header_len + n * geo->key_len; Parentheses around the multiplicative terms would make this a little easier to scan a line for which variables go together, i.e. return geo->header_len + (n * geo->key_len); (Still, a minor nit.) > +} > + > +/* > + * Calculate offset of the n-th node pointer in a btree node. > + */ > +static inline unsigned int > +ext4_btree_val_offset(struct ext4_btree_geo *geo, unsigned int n) > +{ > + return geo->header_len + > + geo->max_pairs * geo->key_len + > + n * geo->val_len; > +} > + > +/* > + * Calculate offset of the n-th record in a btree leaf node. > + */ > +static inline unsigned int > +ext4_btree_rec_offset(struct ext4_btree_geo *geo, unsigned int n) > +{ > + return geo->header_len + n * geo->rec_len; > +} > + > +/* > + * Node header access functions > + */ > +static inline struct ext4_btree_header* > +ext4_btree_header_addr(struct ext4_btree_node *block) > +{ > + return (struct ext4_btree_header *)block; > +} > + > +static inline unsigned int > +ext4_btree_get_numkeys(struct ext4_btree_node *node) > +{ > + return ext4_btree_header_addr(node)->btree_numkeys; > +} > + > +static inline void > +ext4_btree_update_numkeys(struct ext4_btree_node *node, unsigned int n) > +{ > + ext4_btree_header_addr(node)->btree_numkeys = n; > +} > + > +static inline unsigned int > +ext4_btree_get_numrecs(struct ext4_btree_node *node) > +{ > + return ext4_btree_header_addr(node)->btree_numkeys; > +} > + > +static inline void > +ext4_btree_update_numrecs(struct ext4_btree_node *node, unsigned int n) > +{ > + ext4_btree_header_addr(node)->btree_numkeys = n; > +} > + > +static inline unsigned int > +ext4_btree_get_level(struct ext4_btree_node *node) > +{ > + return ext4_btree_header_addr(node)->btree_level; > +} > + > +static inline void > +ext4_btree_update_level(struct ext4_btree_node *node, unsigned int level) > +{ > + ext4_btree_header_addr(node)->btree_level = level; > +} > + > +static inline unsigned long long > +ext4_btree_get_blockno(struct ext4_btree_node *node) > +{ > + return ext4_btree_header_addr(node)->btree_blockno; > +} > + > +static inline void > +ext4_btree_update_blockno(struct ext4_btree_node *node, > + unsigned long long blockno) > +{ > + ext4_btree_header_addr(node)->btree_blockno = blockno; > +} > + > +/* > +* Get n-th key address from btree node > +*/ > +static struct ext4_btree_key* > +ext4_btree_key_addr(struct ext4_btree_geo *geo, > + struct ext4_btree_node * node, > + unsigned int n) > +{ > + return (struct ext4_btree_key *) > + ((unsigned char *)node + ext4_btree_key_offset(geo, n)); > +} > + > +/* > + * Set n-th key from btree node > + */ > +static void > +ext4_btree_set_key(struct ext4_btree_geo *geo, > + struct ext4_btree_node *node, > + unsigned int n, > + struct ext4_btree_key *key) > +{ > + struct ext4_btree_key *keyoffset = ext4_btree_key_addr(geo, node, n); > + *keyoffset = *key; > +} > + > +static void > +ext4_btree_clear_key(struct ext4_btree_key *key) > +{ > + key->blockno = 0; > +} > + > +/* > + * Get n-th val address from btree node > + */ > +static struct ext4_btree_val* > +ext4_btree_val_addr(struct ext4_btree_geo *geo, > + struct ext4_btree_node *node, > + unsigned int n) > +{ > + return (struct ext4_btree_val *) > + ((unsigned char *)node + ext4_btree_val_offset(geo, n)); > +} > + > +/* > + * Set n-th val from btree node > + */ > +static void > +ext4_btree_set_val(struct ext4_btree_geo *geo, > + struct ext4_btree_node *node, > + unsigned int n, > + struct ext4_btree_val *val) > +{ > + struct ext4_btree_val *valaddr = ext4_btree_val_addr(geo, node, n); > + *valaddr = *val; > +} > + > +static void > +ext4_btree_clear_val(struct ext4_btree_val *val) > +{ > + val->blockno = 0; > +} > + > +/* > + * Get n-th record address from btree node > + */ > +static struct ext4_btree_rec* > +ext4_btree_rec_addr(struct ext4_btree_geo *geo, > + struct ext4_btree_node *node, > + unsigned int n) > +{ > + return (struct ext4_btree_rec *) > + ((unsigned char *)node + ext4_btree_rec_offset(geo, n)); > +} > + > + > +/* > + * Set n-th value from btree node > + */ > +static void > +ext4_btree_set_rec(struct ext4_btree_geo *geo, > + struct ext4_btree_node *node, > + unsigned int n, > + struct ext4_btree_rec *rec) > +{ > + struct ext4_btree_rec *rec_offset = ext4_btree_rec_addr(geo, node, n); > + *rec_offset = *rec; > +} > + > +static void > +ext4_btree_clear_rec(struct ext4_btree_rec *rec) > +{ > + rec->key.blockno = 0; > + rec->len = 0; > + rec->ref_count = 0; > +} > + > +/* > + * Initialize btree root node > + */ > + > +void > +ext4_btree_root_init(struct ext4_btree_root *root, > + struct ext4_btree_geo *geo) > +{ > + root->node = NULL; > + root->geo = (*geo); > +} > + > +/* > + * Initialize ref btree root node > + */ > +void > +ext4_ref_btree_init(struct ext4_btree_root *root) > +{ > + ext4_btree_root_init(root, &ref_btree_geo); > +} > + > +/* > + * Allocate a btree node > + */ > +struct ext4_btree_node * > +ext4_btree_node_alloc() > +{ > + return fs_alloc_btree_node(); I'm assuming this is defined somewhere in your test harness? > +} > + > +/* > + * Free a btree node > + */ > + > +void > +ext4_btree_node_free( struct ext4_btree_node *node) > +{ > + fs_free_btree_node(node); > +} > + > +/* > + * Compare two btree keys. > + * Return 1 if key1 > key2. > + * Return 0 if key1 == key2. > + * Return -1 if key1 < key2. > + */ > +int > +ext4_btree_key_comp(struct ext4_btree_geo *geo, > + struct ext4_btree_key *key1, > + struct ext4_btree_key *key2) > +{ > + if (key1->blockno < key2->blockno) > + return -1; > + else if (key1->blockno > key2->blockno) > + return 1; > + else > + return 0; > +} > + > +/* > + * If specified key within record' range > + * Return -1 if key < record's key > + * Return 0 if record has the key > + * Return 1 if record's key + len < key > + */ > +int > +ext4_btree_rec_comp(struct ext4_btree_geo *geo, > + struct ext4_btree_key *key, > + struct ext4_btree_rec *rec) > +{ > + if (key->blockno < rec->key.blockno) > + return -1; > + else if ((rec->key.blockno + rec->len) <= key->blockno) > + return 1; > + else > + return 0; > +} > + > +/* > + * Check if the given record has this key > + * Return 1 if record has this key within range > + * Return 0 if not > + */ > +static inline int > +ext4_btree_rec_has_key(struct ext4_btree_geo *geo, > + struct ext4_btree_rec *rec, > + struct ext4_btree_key *key) > +{ > + return ((rec->key.blockno <= key->blockno) && > + ((rec->key.blockno + rec->len) > key->blockno)); > +} > + > +static inline void > +ext4_btree_set_search_path(struct ext4_btree_search_path * spath, > + int level, > + struct ext4_btree_node * node, > + int pos) > +{ > + if (spath) { > + spath->node[level] = node; > + spath->pos[level] = pos; > + } > +} > + > +/* define the btree layout for refcount btree */ > +struct ext4_btree_geo ref_btree_geo = { > + EXT4_BTREE_NODESIZE, > + EXT4_BTREE_HEADER_SIZE, > + EXT4_BTREE_KEY_SIZE, > + EXT4_BTREE_VAL_SIZE, > + EXT4_BTREE_REC_SIZE, > + EXT4_BTREE_MAX_KEY_VAL_PAIRS, > + EXT4_BTREE_MAX_RECS > +}; > + > +/* remember the index btree node on the search path */ > +struct ext4_btree_search_path { > + struct ext4_btree_node * node[EXT4_BTREE_MAX_LEVELS]; > + int pos [EXT4_BTREE_MAX_LEVELS]; > +}; > + > + > +/* > + * Initialize the search path > + */ > +void > +ext4_btree_init_search_path (struct ext4_btree_search_path *spath) > +{ > + int i; > + for (i=0; i< EXT4_BTREE_MAX_LEVELS; i++) { > + spath->node[i] = NULL; > + spath->pos[i] = EXT4_BTREE_MAX_KEY_VAL_PAIRS; > + } > +} > + > + > +/* > + * Debug function to print out the whole tree > + */ > + > +void > +ext4_btree_rec_node_print(struct ext4_btree_geo *geo, > + struct ext4_btree_node *node) > +{ > + int i; > + struct ext4_btree_rec *rec; > + int num_recs; > + > + if (node == NULL) > + return; > + num_recs = ext4_btree_get_numrecs(node); > + ext4_btree_trace("==rec== node in block %lld - level %d numrecs %d\n", > + ext4_btree_get_blockno(node), > + ext4_btree_get_level(node), > + ext4_btree_get_numrecs(node)); > + for (i = 0; i < num_recs; i++) { > + rec = ext4_btree_rec_addr(geo, node, i); > + ext4_btree_trace("rec%d key 0x%llx len 0x%x refcount %d\n", > + i, rec->key.blockno, rec->len, rec->ref_count); > + } > +} > + > +void > +ext4_btree_index_node_print(struct ext4_btree_geo *geo, > + struct ext4_btree_node *node) > +{ > + int i; > + int num_keys; > + struct ext4_btree_key *key; > + struct ext4_btree_val *val; > + > + num_keys = ext4_btree_get_numkeys(node); > + ext4_btree_trace("--key-- node in block %lld - level %d numkeys %d\n", > + ext4_btree_get_blockno(node), > + ext4_btree_get_level(node), > + ext4_btree_get_numkeys(node)); > + for (i = 0; i < num_keys; i++) { > + key = ext4_btree_key_addr(geo, node, i); > + val = ext4_btree_val_addr(geo, node, i); > + ext4_btree_trace("pair%d key 0x%llx val 0x%llx\n", > + i, key->blockno, val->blockno); > + } > +} > + > +void > +ext4_btree_print(struct ext4_btree_root *root) > +{ > + struct ext4_btree_search_path spath; > + struct ext4_btree_node * node; > + struct ext4_btree_val * val; > + int level; > + int rootlevel; > + int pos; > + int numkeys; > + > + if (root == NULL || root->node == NULL) { > + ext4_btree_trace("Empty tree\n"); > + return; > + } > + > + ext4_btree_trace("Btree Details:\n\n"); > + ext4_btree_init_search_path(&spath); > + node = root->node; > + level = rootlevel = ext4_btree_get_level(node); > + pos = 0; > + while (level >= 0) { > + spath.node[level] = node; > + spath.pos[level] = pos; > + if (level > 0) { > + if (pos == 0) > + ext4_btree_index_node_print(&root->geo, node); > + numkeys = ext4_btree_get_numkeys(node); > + if (pos == numkeys) { > + if (level == rootlevel) > + break; /* reach the end > + /* go to parent node */ > + level ++; > + node = spath.node[level]; > + pos = spath.pos[level] + 1; > + } > + else { > + /* go to next child node */ > + level --; > + val = ext4_btree_val_addr(&root->geo, > + node, pos); > + node = fs_get_btree_node(val->blockno); > + pos = 0 ; > + } > + } > + else { > + ext4_btree_rec_node_print(&root->geo, node); > + if (level == rootlevel) > + break; /* reach the end > + /* go to parent node; */ > + level ++; > + node = spath.node[level]; > + pos = spath.pos[level] + 1; > + } > + } > + ext4_btree_trace("\n"); > +} > + > + > +/* > + * Lookup for a record contain the specified key in btree > + * Return NULL if the key is not found > + */ > +struct ext4_btree_rec* > +ext4_btree_search_key(struct ext4_btree_root *root, > + struct ext4_btree_key *key, > + struct ext4_btree_search_path * spath) > +{ > + unsigned int i; > + int level; > + struct ext4_btree_node *node; > + struct ext4_btree_key *tmp_key; > + struct ext4_btree_val *tmp_val; > + struct ext4_btree_geo *geo; > + struct ext4_btree_rec *rec = NULL; > + > + if (root == NULL || root->node == NULL) > + return NULL; > + /* Search through the key-val index nodes. */ > + node = root->node; > + geo = &root->geo; > + level = ext4_btree_get_level(node); > + while (level > 0) { > + for (i = 0; i < ext4_btree_get_numkeys(node)-1; i++) { > + tmp_key = ext4_btree_key_addr(geo, node, i + 1); > + if (ext4_btree_key_comp(geo, key, tmp_key) < 0) > + break; > + } If the keys and records are stored in sorted order, could you bsearch here instead of linear scanning? Granted the difference might be inconsequential for the ~252 records in a 4K node, but for a 64k node that's a linear scan of ~4092 records. This goes for all the linear scans I see. > + ext4_btree_set_search_path(spath, level, node, i); > + tmp_val = ext4_btree_val_addr(geo, node, i); > + node = fs_get_btree_node(tmp_val->blockno); > + /* read failure*/ > + if (node == NULL) > + return NULL; I wonder if there ought to be a facility for returning integer error codes and passing in a **node (as an out param) instead of using NULL to cover "not found" and "badness happened". > + level --; > + } > + /* Search the leaf node */ > + assert(ext4_btree_get_level(node) == 0); > + rec = ext4_btree_rec_addr(geo, node, 0); > + i = 0; > + while (ext4_btree_rec_comp(geo, key, rec) > 0) { > + i++; > + if (i >= ext4_btree_get_numkeys(node)) { > + ext4_btree_set_search_path(spath, 0, node, i); > + return NULL; > + } > + rec = ext4_btree_rec_addr(geo, node, i); > + } > + ext4_btree_set_search_path(spath, 0, node, i); > + if (ext4_btree_rec_comp(geo, key, rec) == 0) > + return rec; > + else > + return NULL; > +} > + > +/* > + * Lookup for a record contain the specified key in btree > + * Return NULL if the key is not found > + */ > +struct ext4_btree_rec* > +ext4_btree_lookup(struct ext4_btree_root *root, struct ext4_btree_key *key) > +{ > + return ext4_btree_search_key(root, key, NULL); > +} > + > +/* > + * Insert a rec into a leaf node at specifid position > + */ > +void > +ext4_btree_node_insert_in_leaf(struct ext4_btree_root *root, > + struct ext4_btree_node *node, > + struct ext4_btree_rec *rec, > + int pos) > +{ > + int numrecs; > + int i; > + struct ext4_btree_rec *tmprec; > + > + numrecs= ext4_btree_get_numrecs(node); > + for (i = numrecs; i > pos;i--) { > + tmprec = ext4_btree_rec_addr(&root->geo, node, i-1); > + ext4_btree_set_rec(&root->geo, node, i,tmprec); > + } memmove? > + ext4_btree_set_rec(&root->geo, node, pos, rec); > + ext4_btree_update_numrecs(node,numrecs+1); > +} > + > +/* > + * Split a leaf node into two > + */ > +struct ext4_btree_node * > +ext4_btree_split_leaf(struct ext4_btree_root *root, > + struct ext4_btree_node *node) > +{ > + struct ext4_btree_node *rnode; > + struct ext4_btree_rec *tmprec; > + unsigned int i; > + > + rnode = ext4_btree_node_alloc(); > + if (!rnode) { > + /* Cant allocate a new node, ERR*/ > + return NULL; > + } > + for (i = EXT4_BTREE_MAX_RECS/2; i < EXT4_BTREE_MAX_RECS;i++) { > + tmprec = ext4_btree_rec_addr(&root->geo, node, i); > + ext4_btree_set_rec(&root->geo, rnode, > + (i-EXT4_BTREE_MAX_RECS/2), tmprec); > + ext4_btree_clear_rec(tmprec); memcpy/memset? > + } > + ext4_btree_update_numrecs(rnode,EXT4_BTREE_MAX_RECS/2); > + ext4_btree_update_numrecs(node, EXT4_BTREE_MAX_RECS/2); > + return rnode; > +} > + > +/* > + * Split a index node into two > + */ > +struct ext4_btree_node * > +ext4_btree_split_index(struct ext4_btree_root *root, > + struct ext4_btree_node *node) > +{ > + struct ext4_btree_node *rnode; > + struct ext4_btree_key *tmp_key; > + struct ext4_btree_val *tmp_val; > + unsigned int i; > + > + rnode = ext4_btree_node_alloc(); > + if (!rnode) { > + /* Cant allocate a new node, ERR*/ > + return NULL; > + } > + /* split key-val pairs between old node and new node */ > + for (i = EXT4_BTREE_MAX_KEY_VAL_PAIRS/2; > + i < EXT4_BTREE_MAX_KEY_VAL_PAIRS; > + i++) { > + tmp_key = ext4_btree_key_addr(&root->geo, node, i); > + ext4_btree_set_key(&root->geo, rnode, > + (i-EXT4_BTREE_MAX_KEY_VAL_PAIRS/2),tmp_key); > + ext4_btree_clear_key(tmp_key); > + tmp_val = ext4_btree_val_addr(&root->geo, node, i); > + ext4_btree_set_val(&root->geo, rnode, > + (i-EXT4_BTREE_MAX_KEY_VAL_PAIRS/2),tmp_val); > + ext4_btree_clear_val(tmp_val); > + } > + /* set level and numkeys in new node */ > + ext4_btree_update_level(rnode, ext4_btree_get_level(node)); > + ext4_btree_update_numkeys(rnode,EXT4_BTREE_MAX_KEY_VAL_PAIRS/2); > + /* set numkeys in old node */ > + ext4_btree_update_numkeys(node, EXT4_BTREE_MAX_KEY_VAL_PAIRS/2); > + return rnode; > +} > + > + > +/* > + * Insert a key-val pair into a index node at specified position > + */ > +void > +ext4_btree_node_insert_in_index(struct ext4_btree_root *root, > + struct ext4_btree_node *node, > + struct ext4_btree_key *key, > + struct ext4_btree_val *val, > + int pos) > +{ > + int i; > + int numkeys; > + struct ext4_btree_key *pkey; > + struct ext4_btree_val *pval; > + > + numkeys = ext4_btree_get_numkeys(node); > + for (i = numkeys - 1; i >= pos; i--) { > + pkey = ext4_btree_key_addr(&root->geo, node, i); > + ext4_btree_set_key(&root->geo, node, i + 1, pkey); > + pval = ext4_btree_val_addr(&root->geo, node, i); > + ext4_btree_set_val(&root->geo, node, i + 1, pval); > + } > + ext4_btree_set_key(&root->geo, node, pos, key); > + ext4_btree_set_val(&root->geo, node, pos, val); > + ext4_btree_update_numkeys(node, numkeys + 1); > +} > + > +/* > + * Grow tree by one more level. Return the new root node. > + */ > +struct ext4_btree_node * > +ext4_btree_grow(struct ext4_btree_root *root, > + struct ext4_btree_node *lnode, > + struct ext4_btree_node *rnode, > + int level) > +{ > + struct ext4_btree_node * newroot; > + struct ext4_btree_key * key; > + struct ext4_btree_val val; > + > + newroot = ext4_btree_node_alloc(); > + if (!newroot) { > + /* Cant allocate a new node, ERR*/ > + return NULL; > + } > + > + ext4_btree_update_level(newroot, level); > + > + key = ext4_btree_key_addr(&root->geo, lnode, 0); > + ext4_btree_set_key(&root->geo, newroot, 0, key); > + val.blockno = ext4_btree_get_blockno(lnode); > + ext4_btree_set_val(&root->geo, newroot, 0, &val); > + > + key = ext4_btree_key_addr(&root->geo, rnode, 0); > + ext4_btree_set_key(&root->geo, newroot, 1, key); > + val.blockno = ext4_btree_get_blockno(rnode); > + ext4_btree_set_val(&root->geo, newroot, 1, &val); > + > + ext4_btree_update_numkeys(newroot, 2); > + > + return newroot; > + > +} > + > +/* > + * Insert a record to the btree > + */ > +int > +ext4_btree_insert(struct ext4_btree_root *root, struct ext4_btree_rec *rec) > +{ > + unsigned int level; > + struct ext4_btree_node *node, *rnode, *newroot; > + struct ext4_btree_key *key, *rkey; > + struct ext4_btree_val rval; > + struct ext4_btree_search_path spath; > + int pos, full, numkeys; > + struct ext4_btree_rec *searchrec; > + > + if (root->node == NULL) { > + /* empty tree */ > + node = ext4_btree_node_alloc(); > + if (node == NULL) > + return -1; > + root->node = node; > + ext4_btree_update_level(root->node, 0); > + ext4_btree_trace( > + "==rec 0x%llx will be insert in node in block %lld "\ > + "- level %d pos %d\n", > + rec->key.blockno, > + ext4_btree_get_blockno(root->node), > + ext4_btree_get_level(root->node), > + 0); > + > + ext4_btree_node_insert_in_leaf(root, root->node, rec, 0); > + return 0; > + } > + > + /* > + * First we search if the key already present, > + * In the search process, all levels node pointer and position are stored > + * in search pointer for later use > + * there is no duplicated key allowed in the tree > + */ > + ext4_btree_init_search_path(&spath); > + key = &rec->key; > + searchrec = ext4_btree_search_key(root, key, &spath); > + > + if (searchrec) { > + node = spath.node[0]; > + pos = spath.pos[0]; > + ext4_btree_trace("==rec 0x%llx found in node in block %lld " \ > + "- level %d pos %d\n", > + rec->key.blockno, > + ext4_btree_get_blockno(node), > + ext4_btree_get_level(node), > + pos); > + return -1; > + } > + level = 0; > + node = spath.node[0]; > + pos = spath.pos[0]; > + full = ext4_btree_get_numkeys(node) == EXT4_BTREE_MAX_RECS; > + ext4_btree_trace( > + "==rec 0x%llx will be insert in node in block %lld "\ > + "- level %d pos %d\n", > + rec->key.blockno, > + ext4_btree_get_blockno(node), > + ext4_btree_get_level(node), > + pos); > + > + if (!full) { > + ext4_btree_node_insert_in_leaf(root, node, rec, pos); > + while (pos == 0 && > + (++level <= ext4_btree_get_level(root->node))) { > + /* update parent key */ > + node = spath.node[level]; > + pos = spath.pos[level]; > + ext4_btree_set_key(&root->geo, node, pos, key); > + } > + return 0; > + } > + > + /* check if B tree root is full and level reach the max */ > + if ((ext4_btree_get_level(root->node) == EXT4_BTREE_MAX_LEVELS - 1) && > + (ext4_btree_get_numkeys(root->node) > + == EXT4_BTREE_MAX_KEY_VAL_PAIRS)) { > + ext4_btree_trace("Tree reach max level, no more insert\n"); > + return -1; // Tree reach the max level > + } > + > + /* leaf node is full, split it to make space to insert new rec*/ > + numkeys = ext4_btree_get_numkeys(node); > + rnode = ext4_btree_split_leaf(root, node); > + if (rnode == NULL) > + return -1; > + if (pos > EXT4_BTREE_MAX_RECS/2) > + ext4_btree_node_insert_in_leaf(root, rnode, rec, > + pos - EXT4_BTREE_MAX_RECS/2); > + else > + ext4_btree_node_insert_in_leaf(root, node, rec, pos); > + > + /* split index nodes if full, all the way up to root */ > + while (level < ext4_btree_get_level(root->node)) { > + /* Pick up the first key*/ > + rkey = ext4_btree_key_addr(&root->geo, rnode, 0); > + rval.blockno = ext4_btree_get_blockno(rnode); > + level ++; > + node = spath.node[level]; > + pos = spath.pos[level]; > + numkeys = ext4_btree_get_numkeys(node); > + full = (numkeys == EXT4_BTREE_MAX_KEY_VAL_PAIRS); > + if (!full) { > + ext4_btree_node_insert_in_index(root, node, rkey, > + &rval, pos + 1); > + break; > + } > + rnode = ext4_btree_split_index(root, node); > + if (rnode == NULL) > + return -1; > + if (pos > EXT4_BTREE_MAX_KEY_VAL_PAIRS/2 ) { > + ext4_btree_node_insert_in_index(root, rnode, rkey, > + &rval, pos - EXT4_BTREE_MAX_KEY_VAL_PAIRS/2 + 1); > + } else { > + ext4_btree_node_insert_in_index(root, node, rkey, > + &rval, pos + 1); > + } > + } > + if (level == ext4_btree_get_level(node) && full) { > + /* If root is split, grow tree by one more level and assign new root */ > + newroot = ext4_btree_grow(root, node, rnode, level + 1); > + if (newroot != NULL) { > + root->node = newroot; > + } > + } > + return 0; > +} > + > + > +/* > + * Delete one record from leaf node > + */ > +void > +ext4_btree_delete_one(struct ext4_btree_root *root, > + struct ext4_btree_node *node, > + int pos) > +{ > + unsigned int i; > + struct ext4_btree_rec* tmprec; > + unsigned int numrecs; > + > + numrecs= ext4_btree_get_numrecs(node); > + for (i = pos; i< numrecs - 1; i++) { > + tmprec = ext4_btree_rec_addr(&root->geo, node, i + 1); > + ext4_btree_set_rec(&root->geo, node, i, tmprec); > + } > + ext4_btree_update_numrecs(node, numrecs - 1); > +} > + > +/* > + * Delete one key/val pair from index node > + */ > + > +void > +ext4_btree_delete_one_keyval(struct ext4_btree_root *root, > + struct ext4_btree_node *node, > + int pos) > +{ > + unsigned int i; > + struct ext4_btree_key* key; > + struct ext4_btree_val* val; > + unsigned int numkeys; > + > + numkeys= ext4_btree_get_numkeys(node); > + for (i = pos; i< numkeys - 1; i++) { > + key = ext4_btree_key_addr(&root->geo, node, i + 1); > + val = ext4_btree_val_addr(&root->geo, node, i + 1); > + ext4_btree_set_key(&root->geo, node, i, key); > + ext4_btree_set_val(&root->geo, node, i, val); > + } > + ext4_btree_update_numkeys(node, numkeys - 1); > + > +} > + > + > +/* > + * Borrow one record or key/val pair from left sibling > + */ > +void > +ext4_btree_borrow_one_left (struct ext4_btree_root *root, > + struct ext4_btree_node *parent, > + struct ext4_btree_node *lnode, > + struct ext4_btree_node *rnode, > + int pos, > + int level) > +{ > + struct ext4_btree_rec *rec; > + struct ext4_btree_key *key; > + struct ext4_btree_val *val; > + int numrecs; > + int numkeys; > + > + if (level == 0) { > + numrecs = ext4_btree_get_numrecs(lnode); > + rec = ext4_btree_rec_addr(&root->geo, lnode, numrecs - 1); > + key = &rec->key; > + ext4_btree_node_insert_in_leaf(root, rnode, rec, 0); > + ext4_btree_delete_one(root, lnode, numrecs - 1); > + } else { > + numkeys = ext4_btree_get_numkeys(lnode); > + key = ext4_btree_key_addr(&root->geo, lnode, numkeys - 1); > + val = ext4_btree_val_addr(&root->geo, lnode, numkeys - 1); > + ext4_btree_node_insert_in_index(root, rnode, key, val, 0); > + ext4_btree_delete_one_keyval(root, lnode, numkeys - 1); > + } > + /* update parent node key */ > + ext4_btree_set_key(&root->geo, parent, pos, key); > + > +} > + > +/* > + * Borrow one record or key/val pair from right sibling > + */ > +void > +ext4_btree_borrow_one_right (struct ext4_btree_root *root, > + struct ext4_btree_node *parent, > + struct ext4_btree_node *lnode, > + struct ext4_btree_node *rnode, > + int pos, > + int level) > +{ > + struct ext4_btree_rec *rec; > + struct ext4_btree_key *key; > + struct ext4_btree_val *val; > + int numrecs; > + int numkeys; > + > + if (level == 0) { > + rec = ext4_btree_rec_addr(&root->geo, rnode, 0); > + key = &rec->key; > + numrecs = ext4_btree_get_numrecs(lnode); > + ext4_btree_node_insert_in_leaf(root, lnode, rec, numrecs); > + ext4_btree_delete_one(root, rnode, 0); > + } else { > + key = ext4_btree_key_addr(&root->geo, rnode, 0); > + val = ext4_btree_val_addr(&root->geo, rnode, 0); > + numkeys = ext4_btree_get_numkeys(lnode); > + ext4_btree_node_insert_in_index(root, lnode, key, val, numkeys); > + ext4_btree_delete_one_keyval(root, rnode, 0); > + } > + /* update parent node key */ > + ext4_btree_set_key(&root->geo, parent, pos + 1, key); > +} > + > +int > +ext4_btree_rotate(struct ext4_btree_root *root, > + struct ext4_btree_search_path *spath, > + struct ext4_btree_node *node, int level) > +{ > + struct ext4_btree_val * val; > + struct ext4_btree_node *parent, *lnode, *rnode; > + int pos = 0; > + int numkeys; > + > + parent = spath->node[level + 1]; > + pos = spath->pos[level + 1]; > + > + if (pos > 0) { > + /* check the left sibling first*/ > + val = ext4_btree_val_addr(&root->geo, parent, pos - 1); node->node_header.leftsib? > + lnode = fs_get_btree_node(val->blockno); > + if (lnode) { > + numkeys = ext4_btree_get_numkeys(lnode); > + if (numkeys >= EXT4_BTREE_MIN_RECS + 1) { > + /* we could shift a record from left sibling to the node*/ > + ext4_btree_borrow_one_left(root, parent, lnode, > + node, pos, level); > + return 0; > + } > + } > + } > + > + numkeys = ext4_btree_get_numkeys(parent); > + if (pos < numkeys -1) { > + val = ext4_btree_val_addr(&root->geo, parent, pos + 1); > + rnode = fs_get_btree_node(val->blockno); > + if (rnode) { > + numkeys = ext4_btree_get_numkeys(rnode); > + if (numkeys >= EXT4_BTREE_MIN_RECS + 1) { > + /* we could shift a record from left sibling to the node*/ > + ext4_btree_borrow_one_right(root, parent, node, > + rnode, pos, level); > + return 0; > + } > + } > + } > + > + return -1; > + > +} > + > +/* > + * Merge leaf nodes > + */ > + > +int > +ext4_btree_merge_leaf(struct ext4_btree_root *root, > + struct ext4_btree_search_path *spath, > + struct ext4_btree_node *node) > +{ > + int num, lnum, rnum; > + struct ext4_btree_node * parent, *lnode, *rnode; > + int i, pos; > + struct ext4_btree_rec *rec; > + struct ext4_btree_val *val; > + > + parent = spath->node[1]; > + pos = spath->pos[1]; > + > + num = ext4_btree_get_numrecs(node); > + > + /* try to merge left sibling first */ > + if (pos > 0) { > + val = ext4_btree_val_addr(&root->geo, parent, pos - 1); > + lnode = fs_get_btree_node(val->blockno); > + if (!lnode) { > + return -1; > + /* err!*/ > + } > + > + lnum = ext4_btree_get_numrecs(lnode); > + > + for (i = 0; i < num; i++) { > + rec = ext4_btree_rec_addr(&root->geo, node, i); > + ext4_btree_node_insert_in_leaf(root, lnode, rec, lnum+i); > + } > + // delete parent key-val pair > + ext4_btree_delete_one_keyval(root, parent, pos); > + ext4_btree_node_free(node); > + return 0; > + } > + /* then try to merge with right sibling */ > + val = ext4_btree_val_addr(&root->geo, parent, pos + 1); > + rnode = fs_get_btree_node(val->blockno); > + if (!rnode) { > + return -1; > + /* err!*/ > + } > + > + rnum = ext4_btree_get_numrecs(rnode); > + > + for (i = 0; i < rnum; i++) { > + rec = ext4_btree_rec_addr(&root->geo, rnode, i); > + ext4_btree_node_insert_in_leaf(root, node, rec, num+i); > + } > + // delete parent key-val pair > + ext4_btree_delete_one_keyval(root, parent, pos + 1); > + ext4_btree_node_free(rnode); > + return 0; > +} > + > + > +/* > + * Merge index nodes > + */ > + > +int > +ext4_btree_merge_index(struct ext4_btree_root *root, > + struct ext4_btree_search_path *spath, > + struct ext4_btree_node *node, int level) > + > +{ > + int num, lnum, rnum, pnum; > + struct ext4_btree_node * parent, *lnode, *rnode; > + int i, pos; > + struct ext4_btree_key *key; > + struct ext4_btree_val *val; > + > + parent = spath->node[level + 1]; > + pos = spath->pos[level + 1]; > + > + num = ext4_btree_get_numkeys(node); > + > + /* try to merge left sibling first */ > + if (pos > 0) { > + val = ext4_btree_val_addr(&root->geo, parent, pos - 1); > + lnode = fs_get_btree_node(val->blockno); > + if (!lnode) { > + /* err!*/ > + return -1; > + } > + > + lnum = ext4_btree_get_numkeys(lnode); > + > + for (i = 0; i < num; i++) { > + key = ext4_btree_key_addr(&root->geo, node, i); > + val = ext4_btree_val_addr(&root->geo, node, i); > + ext4_btree_node_insert_in_index(root, lnode, key, > + val, lnum + i); > + } > + // delete parent key-val pair > + ext4_btree_delete_one_keyval(root, parent, pos); > + ext4_btree_node_free(node); > + return 0; > + } > + pnum = ext4_btree_get_numkeys(parent); > + if (pnum > 1) { > + /* then try to merge with right sibling */ > + val = ext4_btree_val_addr(&root->geo, parent, 1); /* pos is always 0*/ > + rnode = fs_get_btree_node(val->blockno); > + if (!rnode) { > + return -1; > + /* err!*/ > + } > + > + rnum = ext4_btree_get_numkeys(rnode); > + > + for (i = 0; i < rnum; i++) { > + key = ext4_btree_key_addr(&root->geo, rnode, i); > + val = ext4_btree_val_addr(&root->geo, rnode, i); > + ext4_btree_node_insert_in_index(root, node, key, > + val, num + i); > + } > + // delete parent key-val pair > + ext4_btree_delete_one_keyval(root, parent, pos + 1); > + ext4_btree_node_free(rnode); > + ext4_btree_print(root); > + } else{ > + /* shrink level */ > + ext4_btree_node_free(root->node); > + root->node = node; > + ext4_btree_print(root); > + } > + > + return 0; > +} > + > + > +/* > + * Delete a record from the btree > + */ > +int > +ext4_btree_delete(struct ext4_btree_root *root, struct ext4_btree_key *key) > +{ > + struct ext4_btree_node *node, *parent; > + struct ext4_btree_search_path spath; > + int pos, p_pos; > + struct ext4_btree_rec *searchrec; > + int level; > + int tree_height = ext4_btree_get_level(root->node); > + int ret; > + > + /* > + * First we search if the key already present, > + * In the search process, all levels node pointer and position are stored > + * in search pointer for later use > + * there is no duplicated key allowed in the tree > + */ > + ext4_btree_init_search_path(&spath); > + searchrec = ext4_btree_search_key(root, key, &spath); > + if (!searchrec) > + /* record doesnt exist */ > + return -1; > + node = spath.node[0]; > + pos = spath.pos[0]; > + ext4_btree_trace("==Delete : rec 0x%llx found in node in block %lld " \ > + "- level %d pos %d\n", > + key->blockno, > + ext4_btree_get_blockno(node), > + ext4_btree_get_level(node), > + pos); > + ext4_btree_delete_one(root, node, pos); > + if (pos == 0) { > + /* update parent key */ > + parent = spath.node[1]; > + p_pos = spath.pos[1]; > + key = ext4_btree_key_addr(&root->geo, node, 0); > + ext4_btree_set_key(&root->geo, parent, p_pos, key); > + } > + /* > + * If the node is root node, which we do not require minmum number of records, > + * then we are good now, exit > + */ > + if (ext4_btree_get_level(root->node) == 0) > + return 0; > + if (ext4_btree_get_numrecs(node) >= EXT4_BTREE_MIN_RECS) > + return 0; > + > + level = 0; > + while (level < tree_height && > + ext4_btree_get_numrecs(node) < EXT4_BTREE_MIN_RECS) { > + ret = ext4_btree_rotate(root, &spath, node, level); > + if (!ret) > + return 0; /* node can be rotated with sibling nodes */ > + > + if (level == 0) > + ret = ext4_btree_merge_leaf(root, &spath, node); > + else > + > + ret = ext4_btree_merge_index(root, &spath, > + node, level); > + if (ret) { > + /* err*/ > + return ret; > + } > + level ++; > + node = spath.node[level]; > + } > + return 0; > +} > + > + > +/* > + * Check if a btree is valid > + */ > + > +int > +ext4_btree_index_node_check(struct ext4_btree_root * root, > + struct ext4_btree_node *node) > +{ > + int i; > + int num_keys; > + struct ext4_btree_key *key, *lkey = NULL; > + struct ext4_btree_val *val; > + > + if (node == NULL) > + return -1; > + > + num_keys = ext4_btree_get_numkeys(node); > + if (root->node != node) { > + // non-root node > + if (num_keys < EXT4_BTREE_MIN_KEY_VAL_PAIRS || > + num_keys > EXT4_BTREE_MAX_KEY_VAL_PAIRS) { I think it's important to check !(num_keys > EXT4_BTREE_MAX_KEY_VAL_PAIRS) for root nodes? Also, there's no check of CRC, leftsib, rightsib, or blockno. > + ext4_btree_trace("Invalid numkeys %d in node %lld - " \ > + "level %d\n", > + num_keys, > + ext4_btree_get_blockno(node), > + ext4_btree_get_level(node)); > + return -2; > + } > + } > + > + for (i = 0; i < num_keys; i++) { > + key = ext4_btree_key_addr(&root->geo, node, i); > + val = ext4_btree_val_addr(&root->geo, node, i); > + if (lkey != NULL && ext4_btree_key_comp(&root->geo, lkey, key) >= 0) { > + ext4_btree_trace("Keys are not sorted in node %lld" \ > + "- level %d lkey %d 0x%llx key %d 0x%llx\n", > + ext4_btree_get_blockno(node), > + ext4_btree_get_level(node), > + i-1, key->blockno, i, key->blockno); > + return -3; > + } > + lkey = key; > + } > + return 0; > +} > + > + > +int > +ext4_btree_rec_node_check(struct ext4_btree_root * root, > + struct ext4_btree_node *node) > +{ > + int i; > + struct ext4_btree_rec *rec, *lrec = NULL; > + int num_recs; > + > + if (node == NULL) > + return -1; > + > + num_recs = ext4_btree_get_numrecs(node); > + if (root->node != node) { > + // non-root node > + if (num_recs < EXT4_BTREE_MIN_RECS || > + num_recs > EXT4_BTREE_MAX_RECS) { I think it's still necessary to check !(num_recs > EXT4_BTREE_MAX_RECS) for root nodes. Also, there's no check of CRC, leftsib, rightsib, or blockno. > + ext4_btree_trace("Invalid numrecs %d in node %lld - " \ > + "level %d\n", > + num_recs, > + ext4_btree_get_blockno(node), > + ext4_btree_get_level(node)); > + return -2; Wading in a bit here, but ... return -EUCLEAN? More generally, are these int returns supposed to be regular error codes? > + } > + } > + for (i = 0; i < num_recs; i++) { > + rec = ext4_btree_rec_addr(&root->geo, node, i); > + if (lrec != NULL && > + ext4_btree_key_comp(&root->geo, &lrec->key, &rec->key) >= 0) { > + ext4_btree_trace("Rec are not sorted in block 0x%llx" \ > + "lkey %d 0x%llx key %d 0x%llx \n", > + ext4_btree_get_blockno(node), > + i - 1, lrec->key.blockno, i, rec->key.blockno); > + return -3; > + } > + } > + return 0; > +} > + > +int > +ext4_btree_valid_check(struct ext4_btree_root *root) > +{ > + struct ext4_btree_search_path spath; 96 bytes on the stack, hm. I guess it's not likely to nest too many levels deep. > + struct ext4_btree_header * header; > + struct ext4_btree_node * node; > + struct ext4_btree_val * val; > + int level; > + int rootlevel; > + int pos; > + int numkeys; > + int result; > + > + if (root == NULL || root->node == NULL) { > + return 0; > + } > + /* check geo */ > + if (root->geo.header_len == 0 || > + root->geo.node_len == 0 || > + root->geo.key_len == 0 || > + root->geo.val_len == 0 || > + root->geo.rec_len == 0 || > + root->geo.max_pairs == 0 || > + root->geo.max_records == 0) > + { > + ext4_btree_trace("tree geo is invalid %d %d %d %d %d %d %d\n", > + root->geo.header_len, > + root->geo.node_len, > + root->geo.key_len, > + root->geo.val_len, > + root->geo.rec_len, > + root->geo.max_pairs, > + root->geo.max_records); > + return -1; > + } > + if (root->geo.max_pairs % 2 == 1 || root->geo.max_records % 2 == 1) { > + ext4_btree_trace("tree geo does not support odd pairs %d %d\n", Oh? I'm a little surprised by this requirement, since the header didn't say anything about it. (Also, seeing as those two values are known at compile time, this could be a compiletime_assert()?) > + root->geo.max_pairs, > + root->geo.max_records); > + return -1; > + } > + > + /* check tree */ > + ext4_btree_init_search_path(&spath); > + node = root->node; > + level = rootlevel = ext4_btree_get_level(node); > + pos = 0; Should we check level < 8 here? > + while (level >= 0) { > + spath.node[level] = node; > + spath.pos[level] = pos; > + header = ext4_btree_header_addr(node); > + if (header->btree_magic != REF_BTREE_MAGIC) { > + ext4_btree_trace("node 0x%p block %lld has no MAGIC stamp. level %d pos %d\n", > + node, ext4_btree_get_blockno(node), level, pos); > + return -1; > + } I think there needs to be a check for header->level and header->numkeys here. (Ok, enough for now.) --D > + if (level > 0) { > + if (pos == 0) { > + result = ext4_btree_index_node_check(root, > + node); > + if (result != 0) > + return result; > + } > + numkeys = ext4_btree_get_numkeys(node); > + if (pos == numkeys) { > + if (level == rootlevel) > + break; /* reach the end > + /* go to parent node */ > + level ++; > + node = spath.node[level]; > + pos = spath.pos[level] + 1; > + } > + else { > + /* go to next child node */ > + level --; > + val = ext4_btree_val_addr(&root->geo, > + node, pos); > + node = fs_get_btree_node(val->blockno); > + pos = 0; > + } > + } > + else { > + result = ext4_btree_rec_node_check(root, node); > + if (result != 0) > + return result; > + if (level == rootlevel) > + break; // reach the end > + /* go to parent node; */ > + level ++; > + node = spath.node[level]; > + pos = spath.pos[level] + 1; > + } > + } > + return 0; > +} > -- > 1.9.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html