Snapshot deletion is done by walking down the btree and dropping references as you go. For a given node or leaf, if dropping the reference count will free the block, all of the extents that block points to will also have their reference counts dropped. If dropping the reference count will not free a block, the entire subtree pointed to by that block is skipped. Since most of the metadata lives in leaves, a reference cache is maintained in ram of the extents a given leaf has references on. This limits the number of btree leaves that need to be read while deleting old transactions at run time. Snapshot deletion happens incrementally. The metadata for each root indicates how far the last delete run got into the tree. This allows smaller atomic units for snapshot deletion, and it doesn't stall transaction commit with long running operations. It also means that once snapshot deletion starts, that tree must not be accessed for reading or writing outside of the snapshot deletion code. Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx> --- a/fs/btrfs/extent-tree.c 2009-01-08 09:56:31.803215082 -0500 +++ b/fs/btrfs/extent-tree.c 2009-01-08 09:56:07.321458970 -0500 @@ -3334,3 +3334,498 @@ buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); return buf; } + +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf) +{ + u64 leaf_owner; + u64 leaf_generation; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int nritems; + int ret; + + BUG_ON(!btrfs_is_leaf(leaf)); + nritems = btrfs_header_nritems(leaf); + leaf_owner = btrfs_header_owner(leaf); + leaf_generation = btrfs_header_generation(leaf); + + for (i = 0; i < nritems; i++) { + u64 disk_bytenr; + cond_resched(); + + btrfs_item_key_to_cpu(leaf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + /* + * FIXME make sure to insert a trans record that + * repeats the snapshot del on crash + */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (disk_bytenr == 0) + continue; + + ret = __btrfs_free_extent(trans, root, disk_bytenr, + btrfs_file_extent_disk_num_bytes(leaf, fi), + leaf->start, leaf_owner, leaf_generation, + key.objectid, 0); + BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + } + return 0; +} + +static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_leaf_ref *ref) +{ + int i; + int ret; + struct btrfs_extent_info *info = ref->extents; + + for (i = 0; i < ref->nritems; i++) { + ret = __btrfs_free_extent(trans, root, info->bytenr, + info->num_bytes, ref->bytenr, + ref->owner, ref->generation, + info->objectid, 0); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + + BUG_ON(ret); + info++; + } + + return 0; +} + +static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, + u64 len, u32 *refs) +{ + int ret; + + ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); + BUG_ON(ret); + +#if 0 /* some debugging code in case we see problems here */ + /* if the refs count is one, it won't get increased again. But + * if the ref count is > 1, someone may be decreasing it at + * the same time we are. + */ + if (*refs != 1) { + struct extent_buffer *eb = NULL; + eb = btrfs_find_create_tree_block(root, start, len); + if (eb) + btrfs_tree_lock(eb); + + mutex_lock(&root->fs_info->alloc_mutex); + ret = lookup_extent_ref(NULL, root, start, len, refs); + BUG_ON(ret); + mutex_unlock(&root->fs_info->alloc_mutex); + + if (eb) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + if (*refs == 1) { + printk(KERN_ERR "btrfs block %llu went down to one " + "during drop_snap\n", (unsigned long long)start); + } + + } +#endif + + cond_resched(); + return ret; +} + +/* + * helper function for drop_snapshot, this walks down the tree dropping ref + * counts as it goes. + */ +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) +{ + u64 root_owner; + u64 root_gen; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + struct btrfs_leaf_ref *ref; + u32 blocksize; + int ret; + u32 refs; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, + path->nodes[*level]->len, &refs); + BUG_ON(ret); + if (refs > 1) + goto out; + + /* + * walk down to the last node level and free all the leaves + */ + while (*level >= 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + if (btrfs_header_level(cur) != *level) + WARN_ON(1); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, cur); + BUG_ON(ret); + break; + } + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + + ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); + BUG_ON(ret); + if (refs != 1) { + parent = path->nodes[*level]; + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + path->slots[*level]++; + + ret = __btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + root_owner, root_gen, + *level - 1, 1); + BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + + continue; + } + /* + * at this point, we have a single ref, and since the + * only place referencing this extent is a dead root + * the reference count should never go higher. + * So, we don't need to check it again + */ + if (*level == 1) { + ref = btrfs_lookup_leaf_ref(root, bytenr); + if (ref && ref->generation != ptr_gen) { + btrfs_free_leaf_ref(root, ref); + ref = NULL; + } + if (ref) { + ret = cache_drop_leaf_ref(trans, root, ref); + BUG_ON(ret); + btrfs_remove_leaf_ref(root, ref); + btrfs_free_leaf_ref(root, ref); + *level = 0; + break; + } + } + next = btrfs_find_tree_block(root, bytenr, blocksize); + if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) { + free_extent_buffer(next); + + next = read_tree_block(root, bytenr, blocksize, + ptr_gen); + cond_resched(); +#if 0 + /* + * this is a debugging check and can go away + * the ref should never go all the way down to 1 + * at this point + */ + ret = lookup_extent_ref(NULL, root, bytenr, blocksize, + &refs); + BUG_ON(ret); + WARN_ON(refs != 1); +#endif + } + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } +out: + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + if (path->nodes[*level] == root->node) { + parent = path->nodes[*level]; + bytenr = path->nodes[*level]->start; + } else { + parent = path->nodes[*level + 1]; + bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); + } + + blocksize = btrfs_level_size(root, *level); + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, root_owner, root_gen, + *level, 1); + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + BUG_ON(ret); + + cond_resched(); + return 0; +} + +/* + * helper function for drop_subtree, this function is similar to + * walk_down_tree. The main difference is that it checks reference + * counts while tree blocks are locked. + */ +static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) +{ + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + u64 bytenr; + u64 ptr_gen; + u32 blocksize; + u32 refs; + int ret; + + cur = path->nodes[*level]; + ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, + &refs); + BUG_ON(ret); + if (refs > 1) + goto out; + + while (*level >= 0) { + cur = path->nodes[*level]; + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, cur); + BUG_ON(ret); + clean_tree_block(trans, root, cur); + break; + } + if (path->slots[*level] >= btrfs_header_nritems(cur)) { + clean_tree_block(trans, root, cur); + break; + } + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + + next = read_tree_block(root, bytenr, blocksize, ptr_gen); + btrfs_tree_lock(next); + + ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, + &refs); + BUG_ON(ret); + if (refs > 1) { + parent = path->nodes[*level]; + ret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + *level - 1, 1); + BUG_ON(ret); + path->slots[*level]++; + btrfs_tree_unlock(next); + free_extent_buffer(next); + continue; + } + + *level = btrfs_header_level(next); + path->nodes[*level] = next; + path->slots[*level] = 0; + path->locks[*level] = 1; + cond_resched(); + } +out: + parent = path->nodes[*level + 1]; + bytenr = path->nodes[*level]->start; + blocksize = path->nodes[*level]->len; + + ret = btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, btrfs_header_owner(parent), + btrfs_header_generation(parent), *level, 1); + BUG_ON(ret); + + if (path->locks[*level]) { + btrfs_tree_unlock(path->nodes[*level]); + path->locks[*level] = 0; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + cond_resched(); + return 0; +} + +/* + * helper for dropping snapshots. This walks back up the tree in the path + * to find the first node higher up where we haven't yet gone through + * all the slots + */ +static noinline int walk_up_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int *level, int max_level) +{ + u64 root_owner; + u64 root_gen; + struct btrfs_root_item *root_item = &root->root_item; + int i; + int slot; + int ret; + + for (i = *level; i < max_level && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + node = path->nodes[i]; + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + btrfs_node_key(node, &disk_key, path->slots[i]); + memcpy(&root_item->drop_progress, + &disk_key, sizeof(disk_key)); + root_item->drop_level = i; + return 0; + } else { + struct extent_buffer *parent; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + clean_tree_block(trans, root, path->nodes[*level]); + ret = btrfs_free_extent(trans, root, + path->nodes[*level]->start, + path->nodes[*level]->len, + parent->start, root_owner, + root_gen, *level, 1); + BUG_ON(ret); + if (path->locks[*level]) { + btrfs_tree_unlock(path->nodes[*level]); + path->locks[*level] = 0; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. + */ +int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root + *root) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int i; + int orig_level; + struct btrfs_root_item *root_item = &root->root_item; + + WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); + path = btrfs_alloc_path(); + BUG_ON(!path); + + level = btrfs_header_level(root->node); + orig_level = level; + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + path->nodes[level] = root->node; + extent_buffer_get(root->node); + path->slots[level] = 0; + } else { + struct btrfs_key key; + struct btrfs_disk_key found_key; + struct extent_buffer *node; + + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + level = root_item->drop_level; + path->lowest_level = level; + wret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (wret < 0) { + ret = wret; + goto out; + } + node = path->nodes[level]; + btrfs_node_key(node, &found_key, path->slots[level]); + WARN_ON(memcmp(&found_key, &root_item->drop_progress, + sizeof(found_key))); + /* + * unlock our path, this is safe because only this + * function is allowed to delete this snapshot + */ + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (path->nodes[i] && path->locks[i]) { + path->locks[i] = 0; + btrfs_tree_unlock(path->nodes[i]); + } + } + } + while (1) { + wret = walk_down_tree(trans, root, path, &level); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + + wret = walk_up_tree(trans, root, path, &level, + BTRFS_MAX_LEVEL); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + if (trans->transaction->in_commit) { + ret = -EAGAIN; + break; + } + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + } + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } +out: + btrfs_free_path(path); + return ret; +} + -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html