On 06/23/2015 05:02 PM, Darrick J. Wong wrote: > On Mon, Jun 22, 2015 at 08:24:38PM -0700, mingming cao wrote: >> --- >> ext4_btree.c | 1356 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 1356 insertions(+) >> create mode 100644 ext4_btree.c >> >> diff --git a/ext4_btree.c b/ext4_btree.c >> new file mode 100644 >> index 0000000..baf253c >> --- /dev/null >> +++ b/ext4_btree.c >> @@ -0,0 +1,1356 @@ >> +/* >> + * copyright (C) 2015 Oracle. All rights reserved. > > (Nit-picking, but this should be a capital-C "Copyright") > Thanks! >> + * >> + * This program is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU General Public >> + * License v2 as published by the Free Software Foundation. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + * General Public License for more details. >> + * >> + * You should have received a copy of the GNU General Public >> + * License along with this program; if not, write to the >> + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, >> + * Boston, MA 021110-1307, USA. >> + */ >> + >> + >> + >> + >> +#include "ext4_btree.h" >> + >> +/* >> + * Calculate offset of the n-th key in a btree node. >> + */ >> +static inline unsigned int >> +ext4_btree_key_offset(struct ext4_btree_geo *geo, unsigned int n) >> +{ >> + return geo->header_len + n * geo->key_len; > > Parentheses around the multiplicative terms would make this a little easier > to scan a line for which variables go together, i.e. > > return geo->header_len + (n * geo->key_len); > > (Still, a minor nit.) > no problem, that's been fixed now. >> +} >> + >> +/* >> + * Calculate offset of the n-th node pointer in a btree node. >> + */ >> +static inline unsigned int >> +ext4_btree_val_offset(struct ext4_btree_geo *geo, unsigned int n) >> +{ >> + return geo->header_len + >> + geo->max_pairs * geo->key_len + >> + n * geo->val_len; >> +} >> + >> +/* >> + * Calculate offset of the n-th record in a btree leaf node. >> + */ >> +static inline unsigned int >> +ext4_btree_rec_offset(struct ext4_btree_geo *geo, unsigned int n) >> +{ >> + return geo->header_len + n * geo->rec_len; >> +} >> + >> +/* >> + * Node header access functions >> + */ >> +static inline struct ext4_btree_header* >> +ext4_btree_header_addr(struct ext4_btree_node *block) >> +{ >> + return (struct ext4_btree_header *)block; >> +} >> + >> +static inline unsigned int >> +ext4_btree_get_numkeys(struct ext4_btree_node *node) >> +{ >> + return ext4_btree_header_addr(node)->btree_numkeys; >> +} >> + >> +static inline void >> +ext4_btree_update_numkeys(struct ext4_btree_node *node, unsigned int n) >> +{ >> + ext4_btree_header_addr(node)->btree_numkeys = n; >> +} >> + >> +static inline unsigned int >> +ext4_btree_get_numrecs(struct ext4_btree_node *node) >> +{ >> + return ext4_btree_header_addr(node)->btree_numkeys; >> +} >> + >> +static inline void >> +ext4_btree_update_numrecs(struct ext4_btree_node *node, unsigned int n) >> +{ >> + ext4_btree_header_addr(node)->btree_numkeys = n; >> +} >> + >> +static inline unsigned int >> +ext4_btree_get_level(struct ext4_btree_node *node) >> +{ >> + return ext4_btree_header_addr(node)->btree_level; >> +} >> + >> +static inline void >> +ext4_btree_update_level(struct ext4_btree_node *node, unsigned int level) >> +{ >> + ext4_btree_header_addr(node)->btree_level = level; >> +} >> + >> +static inline unsigned long long >> +ext4_btree_get_blockno(struct ext4_btree_node *node) >> +{ >> + return ext4_btree_header_addr(node)->btree_blockno; >> +} >> + >> +static inline void >> +ext4_btree_update_blockno(struct ext4_btree_node *node, >> + unsigned long long blockno) >> +{ >> + ext4_btree_header_addr(node)->btree_blockno = blockno; >> +} >> + >> +/* >> +* Get n-th key address from btree node >> +*/ >> +static struct ext4_btree_key* >> +ext4_btree_key_addr(struct ext4_btree_geo *geo, >> + struct ext4_btree_node * node, >> + unsigned int n) >> +{ >> + return (struct ext4_btree_key *) >> + ((unsigned char *)node + ext4_btree_key_offset(geo, n)); >> +} >> + >> +/* >> + * Set n-th key from btree node >> + */ >> +static void >> +ext4_btree_set_key(struct ext4_btree_geo *geo, >> + struct ext4_btree_node *node, >> + unsigned int n, >> + struct ext4_btree_key *key) >> +{ >> + struct ext4_btree_key *keyoffset = ext4_btree_key_addr(geo, node, n); >> + *keyoffset = *key; >> +} >> + >> +static void >> +ext4_btree_clear_key(struct ext4_btree_key *key) >> +{ >> + key->blockno = 0; >> +} >> + >> +/* >> + * Get n-th val address from btree node >> + */ >> +static struct ext4_btree_val* >> +ext4_btree_val_addr(struct ext4_btree_geo *geo, >> + struct ext4_btree_node *node, >> + unsigned int n) >> +{ >> + return (struct ext4_btree_val *) >> + ((unsigned char *)node + ext4_btree_val_offset(geo, n)); >> +} >> + >> +/* >> + * Set n-th val from btree node >> + */ >> +static void >> +ext4_btree_set_val(struct ext4_btree_geo *geo, >> + struct ext4_btree_node *node, >> + unsigned int n, >> + struct ext4_btree_val *val) >> +{ >> + struct ext4_btree_val *valaddr = ext4_btree_val_addr(geo, node, n); >> + *valaddr = *val; >> +} >> + >> +static void >> +ext4_btree_clear_val(struct ext4_btree_val *val) >> +{ >> + val->blockno = 0; >> +} >> + >> +/* >> + * Get n-th record address from btree node >> + */ >> +static struct ext4_btree_rec* >> +ext4_btree_rec_addr(struct ext4_btree_geo *geo, >> + struct ext4_btree_node *node, >> + unsigned int n) >> +{ >> + return (struct ext4_btree_rec *) >> + ((unsigned char *)node + ext4_btree_rec_offset(geo, n)); >> +} >> + >> + >> +/* >> + * Set n-th value from btree node >> + */ >> +static void >> +ext4_btree_set_rec(struct ext4_btree_geo *geo, >> + struct ext4_btree_node *node, >> + unsigned int n, >> + struct ext4_btree_rec *rec) >> +{ >> + struct ext4_btree_rec *rec_offset = ext4_btree_rec_addr(geo, node, n); >> + *rec_offset = *rec; >> +} >> + >> +static void >> +ext4_btree_clear_rec(struct ext4_btree_rec *rec) >> +{ >> + rec->key.blockno = 0; >> + rec->len = 0; >> + rec->ref_count = 0; >> +} >> + >> +/* >> + * Initialize btree root node >> + */ >> + >> +void >> +ext4_btree_root_init(struct ext4_btree_root *root, >> + struct ext4_btree_geo *geo) >> +{ >> + root->node = NULL; >> + root->geo = (*geo); >> +} >> + >> +/* >> + * Initialize ref btree root node >> + */ >> +void >> +ext4_ref_btree_init(struct ext4_btree_root *root) >> +{ >> + ext4_btree_root_init(root, &ref_btree_geo); >> +} >> + >> +/* >> + * Allocate a btree node >> + */ >> +struct ext4_btree_node * >> +ext4_btree_node_alloc() >> +{ >> + return fs_alloc_btree_node(); > > I'm assuming this is defined somewhere in your test harness? > yep, the next code release I will have the full in-kernel code that implement this. >> +} >> + >> +/* >> + * Free a btree node >> + */ >> + >> +void >> +ext4_btree_node_free( struct ext4_btree_node *node) >> +{ >> + fs_free_btree_node(node); >> +} >> + >> +/* >> + * Compare two btree keys. >> + * Return 1 if key1 > key2. >> + * Return 0 if key1 == key2. >> + * Return -1 if key1 < key2. >> + */ >> +int >> +ext4_btree_key_comp(struct ext4_btree_geo *geo, >> + struct ext4_btree_key *key1, >> + struct ext4_btree_key *key2) >> +{ >> + if (key1->blockno < key2->blockno) >> + return -1; >> + else if (key1->blockno > key2->blockno) >> + return 1; >> + else >> + return 0; >> +} >> + >> +/* >> + * If specified key within record' range >> + * Return -1 if key < record's key >> + * Return 0 if record has the key >> + * Return 1 if record's key + len < key >> + */ >> +int >> +ext4_btree_rec_comp(struct ext4_btree_geo *geo, >> + struct ext4_btree_key *key, >> + struct ext4_btree_rec *rec) >> +{ >> + if (key->blockno < rec->key.blockno) >> + return -1; >> + else if ((rec->key.blockno + rec->len) <= key->blockno) >> + return 1; >> + else >> + return 0; >> +} >> + >> +/* >> + * Check if the given record has this key >> + * Return 1 if record has this key within range >> + * Return 0 if not >> + */ >> +static inline int >> +ext4_btree_rec_has_key(struct ext4_btree_geo *geo, >> + struct ext4_btree_rec *rec, >> + struct ext4_btree_key *key) >> +{ >> + return ((rec->key.blockno <= key->blockno) && >> + ((rec->key.blockno + rec->len) > key->blockno)); >> +} >> + >> +static inline void >> +ext4_btree_set_search_path(struct ext4_btree_search_path * spath, >> + int level, >> + struct ext4_btree_node * node, >> + int pos) >> +{ >> + if (spath) { >> + spath->node[level] = node; >> + spath->pos[level] = pos; >> + } >> +} >> + >> +/* define the btree layout for refcount btree */ >> +struct ext4_btree_geo ref_btree_geo = { >> + EXT4_BTREE_NODESIZE, >> + EXT4_BTREE_HEADER_SIZE, >> + EXT4_BTREE_KEY_SIZE, >> + EXT4_BTREE_VAL_SIZE, >> + EXT4_BTREE_REC_SIZE, >> + EXT4_BTREE_MAX_KEY_VAL_PAIRS, >> + EXT4_BTREE_MAX_RECS >> +}; >> + >> +/* remember the index btree node on the search path */ >> +struct ext4_btree_search_path { >> + struct ext4_btree_node * node[EXT4_BTREE_MAX_LEVELS]; >> + int pos [EXT4_BTREE_MAX_LEVELS]; >> +}; >> + >> + >> +/* >> + * Initialize the search path >> + */ >> +void >> +ext4_btree_init_search_path (struct ext4_btree_search_path *spath) >> +{ >> + int i; >> + for (i=0; i< EXT4_BTREE_MAX_LEVELS; i++) { >> + spath->node[i] = NULL; >> + spath->pos[i] = EXT4_BTREE_MAX_KEY_VAL_PAIRS; >> + } >> +} >> + >> + >> +/* >> + * Debug function to print out the whole tree >> + */ >> + >> +void >> +ext4_btree_rec_node_print(struct ext4_btree_geo *geo, >> + struct ext4_btree_node *node) >> +{ >> + int i; >> + struct ext4_btree_rec *rec; >> + int num_recs; >> + >> + if (node == NULL) >> + return; >> + num_recs = ext4_btree_get_numrecs(node); >> + ext4_btree_trace("==rec== node in block %lld - level %d numrecs %d\n", >> + ext4_btree_get_blockno(node), >> + ext4_btree_get_level(node), >> + ext4_btree_get_numrecs(node)); >> + for (i = 0; i < num_recs; i++) { >> + rec = ext4_btree_rec_addr(geo, node, i); >> + ext4_btree_trace("rec%d key 0x%llx len 0x%x refcount %d\n", >> + i, rec->key.blockno, rec->len, rec->ref_count); >> + } >> +} >> + >> +void >> +ext4_btree_index_node_print(struct ext4_btree_geo *geo, >> + struct ext4_btree_node *node) >> +{ >> + int i; >> + int num_keys; >> + struct ext4_btree_key *key; >> + struct ext4_btree_val *val; >> + >> + num_keys = ext4_btree_get_numkeys(node); >> + ext4_btree_trace("--key-- node in block %lld - level %d numkeys %d\n", >> + ext4_btree_get_blockno(node), >> + ext4_btree_get_level(node), >> + ext4_btree_get_numkeys(node)); >> + for (i = 0; i < num_keys; i++) { >> + key = ext4_btree_key_addr(geo, node, i); >> + val = ext4_btree_val_addr(geo, node, i); >> + ext4_btree_trace("pair%d key 0x%llx val 0x%llx\n", >> + i, key->blockno, val->blockno); >> + } >> +} >> + >> +void >> +ext4_btree_print(struct ext4_btree_root *root) >> +{ >> + struct ext4_btree_search_path spath; >> + struct ext4_btree_node * node; >> + struct ext4_btree_val * val; >> + int level; >> + int rootlevel; >> + int pos; >> + int numkeys; >> + >> + if (root == NULL || root->node == NULL) { >> + ext4_btree_trace("Empty tree\n"); >> + return; >> + } >> + >> + ext4_btree_trace("Btree Details:\n\n"); >> + ext4_btree_init_search_path(&spath); >> + node = root->node; >> + level = rootlevel = ext4_btree_get_level(node); >> + pos = 0; >> + while (level >= 0) { >> + spath.node[level] = node; >> + spath.pos[level] = pos; >> + if (level > 0) { >> + if (pos == 0) >> + ext4_btree_index_node_print(&root->geo, node); >> + numkeys = ext4_btree_get_numkeys(node); >> + if (pos == numkeys) { >> + if (level == rootlevel) >> + break; /* reach the end >> + /* go to parent node */ >> + level ++; >> + node = spath.node[level]; >> + pos = spath.pos[level] + 1; >> + } >> + else { >> + /* go to next child node */ >> + level --; >> + val = ext4_btree_val_addr(&root->geo, >> + node, pos); >> + node = fs_get_btree_node(val->blockno); >> + pos = 0 ; >> + } >> + } >> + else { >> + ext4_btree_rec_node_print(&root->geo, node); >> + if (level == rootlevel) >> + break; /* reach the end >> + /* go to parent node; */ >> + level ++; >> + node = spath.node[level]; >> + pos = spath.pos[level] + 1; >> + } >> + } >> + ext4_btree_trace("\n"); >> +} >> + >> + >> +/* >> + * Lookup for a record contain the specified key in btree >> + * Return NULL if the key is not found >> + */ >> +struct ext4_btree_rec* >> +ext4_btree_search_key(struct ext4_btree_root *root, >> + struct ext4_btree_key *key, >> + struct ext4_btree_search_path * spath) >> +{ >> + unsigned int i; >> + int level; >> + struct ext4_btree_node *node; >> + struct ext4_btree_key *tmp_key; >> + struct ext4_btree_val *tmp_val; >> + struct ext4_btree_geo *geo; >> + struct ext4_btree_rec *rec = NULL; >> + >> + if (root == NULL || root->node == NULL) >> + return NULL; >> + /* Search through the key-val index nodes. */ >> + node = root->node; >> + geo = &root->geo; >> + level = ext4_btree_get_level(node); >> + while (level > 0) { >> + for (i = 0; i < ext4_btree_get_numkeys(node)-1; i++) { >> + tmp_key = ext4_btree_key_addr(geo, node, i + 1); >> + if (ext4_btree_key_comp(geo, key, tmp_key) < 0) >> + break; >> + } > > If the keys and records are stored in sorted order, could you bsearch here > instead of linear scanning? Granted the difference might be inconsequential > for the ~252 records in a 4K node, but for a 64k node that's a linear scan > of ~4092 records. > > This goes for all the linear scans I see. Agree. Thanks. > >> + ext4_btree_set_search_path(spath, level, node, i); >> + tmp_val = ext4_btree_val_addr(geo, node, i); >> + node = fs_get_btree_node(tmp_val->blockno); >> + /* read failure*/ >> + if (node == NULL) >> + return NULL; > > I wonder if there ought to be a facility for returning integer error codes > and passing in a **node (as an out param) instead of using NULL to cover > "not found" and "badness happened". > s >> + level --; >> + } >> + /* Search the leaf node */ >> + assert(ext4_btree_get_level(node) == 0); >> + rec = ext4_btree_rec_addr(geo, node, 0); >> + i = 0; >> + while (ext4_btree_rec_comp(geo, key, rec) > 0) { >> + i++; >> + if (i >= ext4_btree_get_numkeys(node)) { >> + ext4_btree_set_search_path(spath, 0, node, i); >> + return NULL; >> + } >> + rec = ext4_btree_rec_addr(geo, node, i); >> + } >> + ext4_btree_set_search_path(spath, 0, node, i); >> + if (ext4_btree_rec_comp(geo, key, rec) == 0) >> + return rec; >> + else >> + return NULL; >> +} >> + >> +/* >> + * Lookup for a record contain the specified key in btree >> + * Return NULL if the key is not found >> + */ >> +struct ext4_btree_rec* >> +ext4_btree_lookup(struct ext4_btree_root *root, struct ext4_btree_key *key) >> +{ >> + return ext4_btree_search_key(root, key, NULL); >> +} >> + >> +/* >> + * Insert a rec into a leaf node at specifid position >> + */ >> +void >> +ext4_btree_node_insert_in_leaf(struct ext4_btree_root *root, >> + struct ext4_btree_node *node, >> + struct ext4_btree_rec *rec, >> + int pos) >> +{ >> + int numrecs; >> + int i; >> + struct ext4_btree_rec *tmprec; >> + >> + numrecs= ext4_btree_get_numrecs(node); >> + for (i = numrecs; i > pos;i--) { >> + tmprec = ext4_btree_rec_addr(&root->geo, node, i-1); >> + ext4_btree_set_rec(&root->geo, node, i,tmprec); >> + } > > memmove? Yeah, will change. > >> + ext4_btree_set_rec(&root->geo, node, pos, rec); >> + ext4_btree_update_numrecs(node,numrecs+1); >> +} >> + >> +/* >> + * Split a leaf node into two >> + */ >> +struct ext4_btree_node * >> +ext4_btree_split_leaf(struct ext4_btree_root *root, >> + struct ext4_btree_node *node) >> +{ >> + struct ext4_btree_node *rnode; >> + struct ext4_btree_rec *tmprec; >> + unsigned int i; >> + >> + rnode = ext4_btree_node_alloc(); >> + if (!rnode) { >> + /* Cant allocate a new node, ERR*/ >> + return NULL; >> + } >> + for (i = EXT4_BTREE_MAX_RECS/2; i < EXT4_BTREE_MAX_RECS;i++) { >> + tmprec = ext4_btree_rec_addr(&root->geo, node, i); >> + ext4_btree_set_rec(&root->geo, rnode, >> + (i-EXT4_BTREE_MAX_RECS/2), tmprec); >> + ext4_btree_clear_rec(tmprec); > > memcpy/memset? > nod >> + } >> + ext4_btree_update_numrecs(rnode,EXT4_BTREE_MAX_RECS/2); >> + ext4_btree_update_numrecs(node, EXT4_BTREE_MAX_RECS/2); >> + return rnode; >> +} >> + >> +/* >> + * Split a index node into two >> + */ >> +struct ext4_btree_node * >> +ext4_btree_split_index(struct ext4_btree_root *root, >> + struct ext4_btree_node *node) >> +{ >> + struct ext4_btree_node *rnode; >> + struct ext4_btree_key *tmp_key; >> + struct ext4_btree_val *tmp_val; >> + unsigned int i; >> + >> + rnode = ext4_btree_node_alloc(); >> + if (!rnode) { >> + /* Cant allocate a new node, ERR*/ >> + return NULL; >> + } >> + /* split key-val pairs between old node and new node */ >> + for (i = EXT4_BTREE_MAX_KEY_VAL_PAIRS/2; >> + i < EXT4_BTREE_MAX_KEY_VAL_PAIRS; >> + i++) { >> + tmp_key = ext4_btree_key_addr(&root->geo, node, i); >> + ext4_btree_set_key(&root->geo, rnode, >> + (i-EXT4_BTREE_MAX_KEY_VAL_PAIRS/2),tmp_key); >> + ext4_btree_clear_key(tmp_key); >> + tmp_val = ext4_btree_val_addr(&root->geo, node, i); >> + ext4_btree_set_val(&root->geo, rnode, >> + (i-EXT4_BTREE_MAX_KEY_VAL_PAIRS/2),tmp_val); >> + ext4_btree_clear_val(tmp_val); >> + } >> + /* set level and numkeys in new node */ >> + ext4_btree_update_level(rnode, ext4_btree_get_level(node)); >> + ext4_btree_update_numkeys(rnode,EXT4_BTREE_MAX_KEY_VAL_PAIRS/2); >> + /* set numkeys in old node */ >> + ext4_btree_update_numkeys(node, EXT4_BTREE_MAX_KEY_VAL_PAIRS/2); >> + return rnode; >> +} >> + >> + >> +/* >> + * Insert a key-val pair into a index node at specified position >> + */ >> +void >> +ext4_btree_node_insert_in_index(struct ext4_btree_root *root, >> + struct ext4_btree_node *node, >> + struct ext4_btree_key *key, >> + struct ext4_btree_val *val, >> + int pos) >> +{ >> + int i; >> + int numkeys; >> + struct ext4_btree_key *pkey; >> + struct ext4_btree_val *pval; >> + >> + numkeys = ext4_btree_get_numkeys(node); >> + for (i = numkeys - 1; i >= pos; i--) { >> + pkey = ext4_btree_key_addr(&root->geo, node, i); >> + ext4_btree_set_key(&root->geo, node, i + 1, pkey); >> + pval = ext4_btree_val_addr(&root->geo, node, i); >> + ext4_btree_set_val(&root->geo, node, i + 1, pval); >> + } >> + ext4_btree_set_key(&root->geo, node, pos, key); >> + ext4_btree_set_val(&root->geo, node, pos, val); >> + ext4_btree_update_numkeys(node, numkeys + 1); >> +} >> + >> +/* >> + * Grow tree by one more level. Return the new root node. >> + */ >> +struct ext4_btree_node * >> +ext4_btree_grow(struct ext4_btree_root *root, >> + struct ext4_btree_node *lnode, >> + struct ext4_btree_node *rnode, >> + int level) >> +{ >> + struct ext4_btree_node * newroot; >> + struct ext4_btree_key * key; >> + struct ext4_btree_val val; >> + >> + newroot = ext4_btree_node_alloc(); >> + if (!newroot) { >> + /* Cant allocate a new node, ERR*/ >> + return NULL; >> + } >> + >> + ext4_btree_update_level(newroot, level); >> + >> + key = ext4_btree_key_addr(&root->geo, lnode, 0); >> + ext4_btree_set_key(&root->geo, newroot, 0, key); >> + val.blockno = ext4_btree_get_blockno(lnode); >> + ext4_btree_set_val(&root->geo, newroot, 0, &val); >> + >> + key = ext4_btree_key_addr(&root->geo, rnode, 0); >> + ext4_btree_set_key(&root->geo, newroot, 1, key); >> + val.blockno = ext4_btree_get_blockno(rnode); >> + ext4_btree_set_val(&root->geo, newroot, 1, &val); >> + >> + ext4_btree_update_numkeys(newroot, 2); >> + >> + return newroot; >> + >> +} >> + >> +/* >> + * Insert a record to the btree >> + */ >> +int >> +ext4_btree_insert(struct ext4_btree_root *root, struct ext4_btree_rec *rec) >> +{ >> + unsigned int level; >> + struct ext4_btree_node *node, *rnode, *newroot; >> + struct ext4_btree_key *key, *rkey; >> + struct ext4_btree_val rval; >> + struct ext4_btree_search_path spath; >> + int pos, full, numkeys; >> + struct ext4_btree_rec *searchrec; >> + >> + if (root->node == NULL) { >> + /* empty tree */ >> + node = ext4_btree_node_alloc(); >> + if (node == NULL) >> + return -1; >> + root->node = node; >> + ext4_btree_update_level(root->node, 0); >> + ext4_btree_trace( >> + "==rec 0x%llx will be insert in node in block %lld "\ >> + "- level %d pos %d\n", >> + rec->key.blockno, >> + ext4_btree_get_blockno(root->node), >> + ext4_btree_get_level(root->node), >> + 0); >> + >> + ext4_btree_node_insert_in_leaf(root, root->node, rec, 0); >> + return 0; >> + } >> + >> + /* >> + * First we search if the key already present, >> + * In the search process, all levels node pointer and position are stored >> + * in search pointer for later use >> + * there is no duplicated key allowed in the tree >> + */ >> + ext4_btree_init_search_path(&spath); >> + key = &rec->key; >> + searchrec = ext4_btree_search_key(root, key, &spath); >> + >> + if (searchrec) { >> + node = spath.node[0]; >> + pos = spath.pos[0]; >> + ext4_btree_trace("==rec 0x%llx found in node in block %lld " \ >> + "- level %d pos %d\n", >> + rec->key.blockno, >> + ext4_btree_get_blockno(node), >> + ext4_btree_get_level(node), >> + pos); >> + return -1; >> + } >> + level = 0; >> + node = spath.node[0]; >> + pos = spath.pos[0]; >> + full = ext4_btree_get_numkeys(node) == EXT4_BTREE_MAX_RECS; >> + ext4_btree_trace( >> + "==rec 0x%llx will be insert in node in block %lld "\ >> + "- level %d pos %d\n", >> + rec->key.blockno, >> + ext4_btree_get_blockno(node), >> + ext4_btree_get_level(node), >> + pos); >> + >> + if (!full) { >> + ext4_btree_node_insert_in_leaf(root, node, rec, pos); >> + while (pos == 0 && >> + (++level <= ext4_btree_get_level(root->node))) { >> + /* update parent key */ >> + node = spath.node[level]; >> + pos = spath.pos[level]; >> + ext4_btree_set_key(&root->geo, node, pos, key); >> + } >> + return 0; >> + } >> + >> + /* check if B tree root is full and level reach the max */ >> + if ((ext4_btree_get_level(root->node) == EXT4_BTREE_MAX_LEVELS - 1) && >> + (ext4_btree_get_numkeys(root->node) >> + == EXT4_BTREE_MAX_KEY_VAL_PAIRS)) { >> + ext4_btree_trace("Tree reach max level, no more insert\n"); >> + return -1; // Tree reach the max level >> + } >> + >> + /* leaf node is full, split it to make space to insert new rec*/ >> + numkeys = ext4_btree_get_numkeys(node); >> + rnode = ext4_btree_split_leaf(root, node); >> + if (rnode == NULL) >> + return -1; >> + if (pos > EXT4_BTREE_MAX_RECS/2) >> + ext4_btree_node_insert_in_leaf(root, rnode, rec, >> + pos - EXT4_BTREE_MAX_RECS/2); >> + else >> + ext4_btree_node_insert_in_leaf(root, node, rec, pos); >> + >> + /* split index nodes if full, all the way up to root */ >> + while (level < ext4_btree_get_level(root->node)) { >> + /* Pick up the first key*/ >> + rkey = ext4_btree_key_addr(&root->geo, rnode, 0); >> + rval.blockno = ext4_btree_get_blockno(rnode); >> + level ++; >> + node = spath.node[level]; >> + pos = spath.pos[level]; >> + numkeys = ext4_btree_get_numkeys(node); >> + full = (numkeys == EXT4_BTREE_MAX_KEY_VAL_PAIRS); >> + if (!full) { >> + ext4_btree_node_insert_in_index(root, node, rkey, >> + &rval, pos + 1); >> + break; >> + } >> + rnode = ext4_btree_split_index(root, node); >> + if (rnode == NULL) >> + return -1; >> + if (pos > EXT4_BTREE_MAX_KEY_VAL_PAIRS/2 ) { >> + ext4_btree_node_insert_in_index(root, rnode, rkey, >> + &rval, pos - EXT4_BTREE_MAX_KEY_VAL_PAIRS/2 + 1); >> + } else { >> + ext4_btree_node_insert_in_index(root, node, rkey, >> + &rval, pos + 1); >> + } >> + } >> + if (level == ext4_btree_get_level(node) && full) { >> + /* If root is split, grow tree by one more level and assign new root */ >> + newroot = ext4_btree_grow(root, node, rnode, level + 1); >> + if (newroot != NULL) { >> + root->node = newroot; >> + } >> + } >> + return 0; >> +} >> + >> + >> +/* >> + * Delete one record from leaf node >> + */ >> +void >> +ext4_btree_delete_one(struct ext4_btree_root *root, >> + struct ext4_btree_node *node, >> + int pos) >> +{ >> + unsigned int i; >> + struct ext4_btree_rec* tmprec; >> + unsigned int numrecs; >> + >> + numrecs= ext4_btree_get_numrecs(node); >> + for (i = pos; i< numrecs - 1; i++) { >> + tmprec = ext4_btree_rec_addr(&root->geo, node, i + 1); >> + ext4_btree_set_rec(&root->geo, node, i, tmprec); >> + } >> + ext4_btree_update_numrecs(node, numrecs - 1); >> +} >> + >> +/* >> + * Delete one key/val pair from index node >> + */ >> + >> +void >> +ext4_btree_delete_one_keyval(struct ext4_btree_root *root, >> + struct ext4_btree_node *node, >> + int pos) >> +{ >> + unsigned int i; >> + struct ext4_btree_key* key; >> + struct ext4_btree_val* val; >> + unsigned int numkeys; >> + >> + numkeys= ext4_btree_get_numkeys(node); >> + for (i = pos; i< numkeys - 1; i++) { >> + key = ext4_btree_key_addr(&root->geo, node, i + 1); >> + val = ext4_btree_val_addr(&root->geo, node, i + 1); >> + ext4_btree_set_key(&root->geo, node, i, key); >> + ext4_btree_set_val(&root->geo, node, i, val); >> + } >> + ext4_btree_update_numkeys(node, numkeys - 1); >> + >> +} >> + >> + >> +/* >> + * Borrow one record or key/val pair from left sibling >> + */ >> +void >> +ext4_btree_borrow_one_left (struct ext4_btree_root *root, >> + struct ext4_btree_node *parent, >> + struct ext4_btree_node *lnode, >> + struct ext4_btree_node *rnode, >> + int pos, >> + int level) >> +{ >> + struct ext4_btree_rec *rec; >> + struct ext4_btree_key *key; >> + struct ext4_btree_val *val; >> + int numrecs; >> + int numkeys; >> + >> + if (level == 0) { >> + numrecs = ext4_btree_get_numrecs(lnode); >> + rec = ext4_btree_rec_addr(&root->geo, lnode, numrecs - 1); >> + key = &rec->key; >> + ext4_btree_node_insert_in_leaf(root, rnode, rec, 0); >> + ext4_btree_delete_one(root, lnode, numrecs - 1); >> + } else { >> + numkeys = ext4_btree_get_numkeys(lnode); >> + key = ext4_btree_key_addr(&root->geo, lnode, numkeys - 1); >> + val = ext4_btree_val_addr(&root->geo, lnode, numkeys - 1); >> + ext4_btree_node_insert_in_index(root, rnode, key, val, 0); >> + ext4_btree_delete_one_keyval(root, lnode, numkeys - 1); >> + } >> + /* update parent node key */ >> + ext4_btree_set_key(&root->geo, parent, pos, key); >> + >> +} >> + >> +/* >> + * Borrow one record or key/val pair from right sibling >> + */ >> +void >> +ext4_btree_borrow_one_right (struct ext4_btree_root *root, >> + struct ext4_btree_node *parent, >> + struct ext4_btree_node *lnode, >> + struct ext4_btree_node *rnode, >> + int pos, >> + int level) >> +{ >> + struct ext4_btree_rec *rec; >> + struct ext4_btree_key *key; >> + struct ext4_btree_val *val; >> + int numrecs; >> + int numkeys; >> + >> + if (level == 0) { >> + rec = ext4_btree_rec_addr(&root->geo, rnode, 0); >> + key = &rec->key; >> + numrecs = ext4_btree_get_numrecs(lnode); >> + ext4_btree_node_insert_in_leaf(root, lnode, rec, numrecs); >> + ext4_btree_delete_one(root, rnode, 0); >> + } else { >> + key = ext4_btree_key_addr(&root->geo, rnode, 0); >> + val = ext4_btree_val_addr(&root->geo, rnode, 0); >> + numkeys = ext4_btree_get_numkeys(lnode); >> + ext4_btree_node_insert_in_index(root, lnode, key, val, numkeys); >> + ext4_btree_delete_one_keyval(root, rnode, 0); >> + } >> + /* update parent node key */ >> + ext4_btree_set_key(&root->geo, parent, pos + 1, key); >> +} >> + >> +int >> +ext4_btree_rotate(struct ext4_btree_root *root, >> + struct ext4_btree_search_path *spath, >> + struct ext4_btree_node *node, int level) >> +{ >> + struct ext4_btree_val * val; >> + struct ext4_btree_node *parent, *lnode, *rnode; >> + int pos = 0; >> + int numkeys; >> + >> + parent = spath->node[level + 1]; >> + pos = spath->pos[level + 1]; >> + >> + if (pos > 0) { >> + /* check the left sibling first*/ >> + val = ext4_btree_val_addr(&root->geo, parent, pos - 1); > > node->node_header.leftsib? > Sure. I had the search logic before I added the sibling field. Will eventually update this part using the left/right sibling. >> + lnode = fs_get_btree_node(val->blockno); >> + if (lnode) { >> + numkeys = ext4_btree_get_numkeys(lnode); >> + if (numkeys >= EXT4_BTREE_MIN_RECS + 1) { >> + /* we could shift a record from left sibling to the node*/ >> + ext4_btree_borrow_one_left(root, parent, lnode, >> + node, pos, level); >> + return 0; >> + } >> + } >> + } >> + >> + numkeys = ext4_btree_get_numkeys(parent); >> + if (pos < numkeys -1) { >> + val = ext4_btree_val_addr(&root->geo, parent, pos + 1); >> + rnode = fs_get_btree_node(val->blockno); >> + if (rnode) { >> + numkeys = ext4_btree_get_numkeys(rnode); >> + if (numkeys >= EXT4_BTREE_MIN_RECS + 1) { >> + /* we could shift a record from left sibling to the node*/ >> + ext4_btree_borrow_one_right(root, parent, node, >> + rnode, pos, level); >> + return 0; >> + } >> + } >> + } >> + >> + return -1; >> + >> +} >> + >> +/* >> + * Merge leaf nodes >> + */ >> + >> +int >> +ext4_btree_merge_leaf(struct ext4_btree_root *root, >> + struct ext4_btree_search_path *spath, >> + struct ext4_btree_node *node) >> +{ >> + int num, lnum, rnum; >> + struct ext4_btree_node * parent, *lnode, *rnode; >> + int i, pos; >> + struct ext4_btree_rec *rec; >> + struct ext4_btree_val *val; >> + >> + parent = spath->node[1]; >> + pos = spath->pos[1]; >> + >> + num = ext4_btree_get_numrecs(node); >> + >> + /* try to merge left sibling first */ >> + if (pos > 0) { >> + val = ext4_btree_val_addr(&root->geo, parent, pos - 1); >> + lnode = fs_get_btree_node(val->blockno); >> + if (!lnode) { >> + return -1; >> + /* err!*/ >> + } >> + >> + lnum = ext4_btree_get_numrecs(lnode); >> + >> + for (i = 0; i < num; i++) { >> + rec = ext4_btree_rec_addr(&root->geo, node, i); >> + ext4_btree_node_insert_in_leaf(root, lnode, rec, lnum+i); >> + } >> + // delete parent key-val pair >> + ext4_btree_delete_one_keyval(root, parent, pos); >> + ext4_btree_node_free(node); >> + return 0; >> + } >> + /* then try to merge with right sibling */ >> + val = ext4_btree_val_addr(&root->geo, parent, pos + 1); >> + rnode = fs_get_btree_node(val->blockno); >> + if (!rnode) { >> + return -1; >> + /* err!*/ >> + } >> + >> + rnum = ext4_btree_get_numrecs(rnode); >> + >> + for (i = 0; i < rnum; i++) { >> + rec = ext4_btree_rec_addr(&root->geo, rnode, i); >> + ext4_btree_node_insert_in_leaf(root, node, rec, num+i); >> + } >> + // delete parent key-val pair >> + ext4_btree_delete_one_keyval(root, parent, pos + 1); >> + ext4_btree_node_free(rnode); >> + return 0; >> +} >> + >> + >> +/* >> + * Merge index nodes >> + */ >> + >> +int >> +ext4_btree_merge_index(struct ext4_btree_root *root, >> + struct ext4_btree_search_path *spath, >> + struct ext4_btree_node *node, int level) >> + >> +{ >> + int num, lnum, rnum, pnum; >> + struct ext4_btree_node * parent, *lnode, *rnode; >> + int i, pos; >> + struct ext4_btree_key *key; >> + struct ext4_btree_val *val; >> + >> + parent = spath->node[level + 1]; >> + pos = spath->pos[level + 1]; >> + >> + num = ext4_btree_get_numkeys(node); >> + >> + /* try to merge left sibling first */ >> + if (pos > 0) { >> + val = ext4_btree_val_addr(&root->geo, parent, pos - 1); >> + lnode = fs_get_btree_node(val->blockno); >> + if (!lnode) { >> + /* err!*/ >> + return -1; >> + } >> + >> + lnum = ext4_btree_get_numkeys(lnode); >> + >> + for (i = 0; i < num; i++) { >> + key = ext4_btree_key_addr(&root->geo, node, i); >> + val = ext4_btree_val_addr(&root->geo, node, i); >> + ext4_btree_node_insert_in_index(root, lnode, key, >> + val, lnum + i); >> + } >> + // delete parent key-val pair >> + ext4_btree_delete_one_keyval(root, parent, pos); >> + ext4_btree_node_free(node); >> + return 0; >> + } >> + pnum = ext4_btree_get_numkeys(parent); >> + if (pnum > 1) { >> + /* then try to merge with right sibling */ >> + val = ext4_btree_val_addr(&root->geo, parent, 1); /* pos is always 0*/ >> + rnode = fs_get_btree_node(val->blockno); >> + if (!rnode) { >> + return -1; >> + /* err!*/ >> + } >> + >> + rnum = ext4_btree_get_numkeys(rnode); >> + >> + for (i = 0; i < rnum; i++) { >> + key = ext4_btree_key_addr(&root->geo, rnode, i); >> + val = ext4_btree_val_addr(&root->geo, rnode, i); >> + ext4_btree_node_insert_in_index(root, node, key, >> + val, num + i); >> + } >> + // delete parent key-val pair >> + ext4_btree_delete_one_keyval(root, parent, pos + 1); >> + ext4_btree_node_free(rnode); >> + ext4_btree_print(root); >> + } else{ >> + /* shrink level */ >> + ext4_btree_node_free(root->node); >> + root->node = node; >> + ext4_btree_print(root); >> + } >> + >> + return 0; >> +} >> + >> + >> +/* >> + * Delete a record from the btree >> + */ >> +int >> +ext4_btree_delete(struct ext4_btree_root *root, struct ext4_btree_key *key) >> +{ >> + struct ext4_btree_node *node, *parent; >> + struct ext4_btree_search_path spath; >> + int pos, p_pos; >> + struct ext4_btree_rec *searchrec; >> + int level; >> + int tree_height = ext4_btree_get_level(root->node); >> + int ret; >> + >> + /* >> + * First we search if the key already present, >> + * In the search process, all levels node pointer and position are stored >> + * in search pointer for later use >> + * there is no duplicated key allowed in the tree >> + */ >> + ext4_btree_init_search_path(&spath); >> + searchrec = ext4_btree_search_key(root, key, &spath); >> + if (!searchrec) >> + /* record doesnt exist */ >> + return -1; >> + node = spath.node[0]; >> + pos = spath.pos[0]; >> + ext4_btree_trace("==Delete : rec 0x%llx found in node in block %lld " \ >> + "- level %d pos %d\n", >> + key->blockno, >> + ext4_btree_get_blockno(node), >> + ext4_btree_get_level(node), >> + pos); >> + ext4_btree_delete_one(root, node, pos); >> + if (pos == 0) { >> + /* update parent key */ >> + parent = spath.node[1]; >> + p_pos = spath.pos[1]; >> + key = ext4_btree_key_addr(&root->geo, node, 0); >> + ext4_btree_set_key(&root->geo, parent, p_pos, key); >> + } >> + /* >> + * If the node is root node, which we do not require minmum number of records, >> + * then we are good now, exit >> + */ >> + if (ext4_btree_get_level(root->node) == 0) >> + return 0; >> + if (ext4_btree_get_numrecs(node) >= EXT4_BTREE_MIN_RECS) >> + return 0; >> + >> + level = 0; >> + while (level < tree_height && >> + ext4_btree_get_numrecs(node) < EXT4_BTREE_MIN_RECS) { >> + ret = ext4_btree_rotate(root, &spath, node, level); >> + if (!ret) >> + return 0; /* node can be rotated with sibling nodes */ >> + >> + if (level == 0) >> + ret = ext4_btree_merge_leaf(root, &spath, node); >> + else >> + >> + ret = ext4_btree_merge_index(root, &spath, >> + node, level); >> + if (ret) { >> + /* err*/ >> + return ret; >> + } >> + level ++; >> + node = spath.node[level]; >> + } >> + return 0; >> +} >> + >> + >> +/* >> + * Check if a btree is valid >> + */ >> + >> +int >> +ext4_btree_index_node_check(struct ext4_btree_root * root, >> + struct ext4_btree_node *node) >> +{ >> + int i; >> + int num_keys; >> + struct ext4_btree_key *key, *lkey = NULL; >> + struct ext4_btree_val *val; >> + >> + if (node == NULL) >> + return -1; >> + >> + num_keys = ext4_btree_get_numkeys(node); >> + if (root->node != node) { >> + // non-root node >> + if (num_keys < EXT4_BTREE_MIN_KEY_VAL_PAIRS || >> + num_keys > EXT4_BTREE_MAX_KEY_VAL_PAIRS) { > > I think it's important to check !(num_keys > EXT4_BTREE_MAX_KEY_VAL_PAIRS) > for root nodes? > > Also, there's no check of CRC, leftsib, rightsib, or blockno. > >> + ext4_btree_trace("Invalid numkeys %d in node %lld - " \ >> + "level %d\n", >> + num_keys, >> + ext4_btree_get_blockno(node), >> + ext4_btree_get_level(node)); >> + return -2; >> + } >> + } >> + >> + for (i = 0; i < num_keys; i++) { >> + key = ext4_btree_key_addr(&root->geo, node, i); >> + val = ext4_btree_val_addr(&root->geo, node, i); >> + if (lkey != NULL && ext4_btree_key_comp(&root->geo, lkey, key) >= 0) { >> + ext4_btree_trace("Keys are not sorted in node %lld" \ >> + "- level %d lkey %d 0x%llx key %d 0x%llx\n", >> + ext4_btree_get_blockno(node), >> + ext4_btree_get_level(node), >> + i-1, key->blockno, i, key->blockno); >> + return -3; >> + } >> + lkey = key; >> + } >> + return 0; >> +} >> + >> + >> +int >> +ext4_btree_rec_node_check(struct ext4_btree_root * root, >> + struct ext4_btree_node *node) >> +{ >> + int i; >> + struct ext4_btree_rec *rec, *lrec = NULL; >> + int num_recs; >> + >> + if (node == NULL) >> + return -1; >> + >> + num_recs = ext4_btree_get_numrecs(node); >> + if (root->node != node) { >> + // non-root node >> + if (num_recs < EXT4_BTREE_MIN_RECS || >> + num_recs > EXT4_BTREE_MAX_RECS) { > > I think it's still necessary to check !(num_recs > EXT4_BTREE_MAX_RECS) for > root nodes. > agree. > Also, there's no check of CRC, leftsib, rightsib, or blockno. Will add check for those fields. For now they are just fields that I plan to add/use to assist searching. Once they are being used the propoer checking/validating need to to handled properly too. > >> + ext4_btree_trace("Invalid numrecs %d in node %lld - " \ >> + "level %d\n", >> + num_recs, >> + ext4_btree_get_blockno(node), >> + ext4_btree_get_level(node)); >> + return -2; > > Wading in a bit here, but ... return -EUCLEAN? > > More generally, are these int returns supposed to be regular error codes? > yes, yes, the error handling should be more coded in kernel-regular way. I will go through the error code one more >> + } >> + } >> + for (i = 0; i < num_recs; i++) { >> + rec = ext4_btree_rec_addr(&root->geo, node, i); >> + if (lrec != NULL && >> + ext4_btree_key_comp(&root->geo, &lrec->key, &rec->key) >= 0) { >> + ext4_btree_trace("Rec are not sorted in block 0x%llx" \ >> + "lkey %d 0x%llx key %d 0x%llx \n", >> + ext4_btree_get_blockno(node), >> + i - 1, lrec->key.blockno, i, rec->key.blockno); >> + return -3; >> + } >> + } >> + return 0; >> +} >> + >> +int >> +ext4_btree_valid_check(struct ext4_btree_root *root) >> +{ >> + struct ext4_btree_search_path spath; > > 96 bytes on the stack, hm. I guess it's not likely to nest too many levels > deep. > >> + struct ext4_btree_header * header; >> + struct ext4_btree_node * node; >> + struct ext4_btree_val * val; >> + int level; >> + int rootlevel; >> + int pos; >> + int numkeys; >> + int result; >> + >> + if (root == NULL || root->node == NULL) { >> + return 0; >> + } >> + /* check geo */ >> + if (root->geo.header_len == 0 || >> + root->geo.node_len == 0 || >> + root->geo.key_len == 0 || >> + root->geo.val_len == 0 || >> + root->geo.rec_len == 0 || >> + root->geo.max_pairs == 0 || >> + root->geo.max_records == 0) >> + { >> + ext4_btree_trace("tree geo is invalid %d %d %d %d %d %d %d\n", >> + root->geo.header_len, >> + root->geo.node_len, >> + root->geo.key_len, >> + root->geo.val_len, >> + root->geo.rec_len, >> + root->geo.max_pairs, >> + root->geo.max_records); >> + return -1; >> + } >> + if (root->geo.max_pairs % 2 == 1 || root->geo.max_records % 2 == 1) { >> + ext4_btree_trace("tree geo does not support odd pairs %d %d\n", > > Oh? I'm a little surprised by this requirement, since the header didn't say > anything about it. > Ah.. requiring the # of pair to be odd is just a tempory thing, this makes coding/logic simple when split and merge.. I do plan to fix it later. There isnt hard requirement that key/value pairs has to be odd. Thanks for catching this, I will fix it next time. > (Also, seeing as those two values are known at compile time, this could be > a compiletime_assert()?) > >> + root->geo.max_pairs, >> + root->geo.max_records); >> + return -1; >> + } >> + >> + /* check tree */ >> + ext4_btree_init_search_path(&spath); >> + node = root->node; >> + level = rootlevel = ext4_btree_get_level(node); >> + pos = 0; > > Should we check level < 8 here? > Sure, I will add that. >> + while (level >= 0) { >> + spath.node[level] = node; >> + spath.pos[level] = pos; >> + header = ext4_btree_header_addr(node); >> + if (header->btree_magic != REF_BTREE_MAGIC) { >> + ext4_btree_trace("node 0x%p block %lld has no MAGIC stamp. level %d pos %d\n", >> + node, ext4_btree_get_blockno(node), level, pos); >> + return -1; >> + } > > I think there needs to be a check for header->level and header->numkeys here. > > (Ok, enough for now.) > Yes, that's right. Thanks a lot for going through this and share your comments! Mingming > --D > >> + if (level > 0) { >> + if (pos == 0) { >> + result = ext4_btree_index_node_check(root, >> + node); >> + if (result != 0) >> + return result; >> + } >> + numkeys = ext4_btree_get_numkeys(node); >> + if (pos == numkeys) { >> + if (level == rootlevel) >> + break; /* reach the end >> + /* go to parent node */ >> + level ++; >> + node = spath.node[level]; >> + pos = spath.pos[level] + 1; >> + } >> + else { >> + /* go to next child node */ >> + level --; >> + val = ext4_btree_val_addr(&root->geo, >> + node, pos); >> + node = fs_get_btree_node(val->blockno); >> + pos = 0; >> + } >> + } >> + else { >> + result = ext4_btree_rec_node_check(root, node); >> + if (result != 0) >> + return result; >> + if (level == rootlevel) >> + break; // reach the end >> + /* go to parent node; */ >> + level ++; >> + node = spath.node[level]; >> + pos = spath.pos[level] + 1; >> + } >> + } >> + return 0; >> +} >> -- >> 1.9.1 >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in > -- > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html