On Wed, 7 Jan 2009 22:56:52 -0500 Chris Mason <chris.mason@xxxxxxxxxx> wrote: > All block and extent pointers in btrfs use a logical address space. This file > is responsible for mapping that logical address to a physical block on disk. > > It implements raid0, raid1 and raid10. The block groups used by extent-tree.c > are allocated from each device in chunks of ~1GB in size. This file maintains > the chunk tree and device allocation tree that are used to record the > areas of each disk that are in use. > > Each super block has a bootstrapping section that is used to translate > the extents in the chunk tree. The chunk tree is used to translate all > of the other extents in the filesystem to a physical block on disk. Where would one go to find out about the user-visible management of these devices? > This file also provides async bio submission code. The bios are often > sent down by async helper threads that are supposed to be working on > checksumming or other cpu intensive tasks. We don't want those tasks > waiting for a free request, so a helper thread is used here that is willing > to actually wait on the drive. Seems odd. Why have a thread to wait on IO? Why not just fire-and-forget, and then only block if some caller wishes to use the results of that IO? > > ... > > +struct map_lookup { > + u64 type; > + int io_align; > + int io_width; > + int stripe_len; > + int sector_size; > + int num_stripes; > + int sub_stripes; > + struct btrfs_bio_stripe stripes[]; > +}; this structure looks interesting. 
> +static int init_first_rw_device(struct btrfs_trans_handle *trans, > + struct btrfs_root *root, > + struct btrfs_device *device); > +static int btrfs_relocate_sys_chunks(struct btrfs_root *root); > + > +#define map_lookup_size(n) (sizeof(struct map_lookup) + \ > + (sizeof(struct btrfs_bio_stripe) * (n))) > + > +static DEFINE_MUTEX(uuid_mutex); > +static LIST_HEAD(fs_uuids); > + > +void btrfs_lock_volumes(void) > +{ > + mutex_lock(&uuid_mutex); > +} > + > +void btrfs_unlock_volumes(void) > +{ > + mutex_unlock(&uuid_mutex); > +} > + > +static void lock_chunks(struct btrfs_root *root) > +{ > + mutex_lock(&root->fs_info->chunk_mutex); > +} > + > +static void unlock_chunks(struct btrfs_root *root) > +{ > + mutex_unlock(&root->fs_info->chunk_mutex); > +} > + > +static void free_fs_devices(struct btrfs_fs_devices *fs_devices) > +{ > + struct btrfs_device *device; > + WARN_ON(fs_devices->opened); > + while (!list_empty(&fs_devices->devices)) { > + device = list_entry(fs_devices->devices.next, > + struct btrfs_device, dev_list); > + list_del(&device->dev_list); > + kfree(device->name); > + kfree(device); > + } > + kfree(fs_devices); > +} Called under btrfs_lock_volumes(), one assumes/hopes. A WARN_ON(!mutex_is_locked()) is a nice way of documenting such things. > +int btrfs_cleanup_fs_uuids(void) > +{ > + struct btrfs_fs_devices *fs_devices; > + > + while (!list_empty(&fs_uuids)) { > + fs_devices = list_entry(fs_uuids.next, > + struct btrfs_fs_devices, list); > + list_del(&fs_devices->list); > + free_fs_devices(fs_devices); > + } > + return 0; > +} Ditto. > +static noinline struct btrfs_device *__find_device(struct list_head *head, > + u64 devid, u8 *uuid) It's not possible for the code reader to work out why all these noinlines are here (this reader, at least). 
> +{ > + struct btrfs_device *dev; > + struct list_head *cur; > + > + list_for_each(cur, head) { > + dev = list_entry(cur, struct btrfs_device, dev_list); > + if (dev->devid == devid && > + (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { > + return dev; > + } > + } > + return NULL; > +} > + > +static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) > +{ > + struct list_head *cur; > + struct btrfs_fs_devices *fs_devices; > + > + list_for_each(cur, &fs_uuids) { > + fs_devices = list_entry(cur, struct btrfs_fs_devices, list); > + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) > + return fs_devices; > + } > + return NULL; > +} Called under btrfs_lock_volumes()... > +/* > + * we try to collect pending bios for a device so we don't get a large > + * number of procs sending bios down to the same device. This greatly > + * improves the schedulers ability to collect and merge the bios. eh? The block layer already implements limiting. Why does _not_ sending bios to a device aid the IO scheduler's merging activity? The comment needs help, I suspect. > + * But, it also turns into a long list of bios to process and that is sure > + * to eventually make the worker thread block. Why is that bad? If there's a large amount of IO queued then blocking seems an appropriate thing to do. > The solution here is to > + * make some progress and then put this work struct back at the end of > + * the list if the block device is congested. This way, multiple devices > + * can make progress from a single worker thread. hrm, OK. I think. > + */ > +static noinline int run_scheduled_bios(struct btrfs_device *device) > +{ > + struct bio *pending; > + struct backing_dev_info *bdi; > + struct btrfs_fs_info *fs_info; > + struct bio *tail; > + struct bio *cur; > + int again = 0; > + unsigned long num_run = 0; > + unsigned long limit; > + > + bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; device->bdev->bd_inode_backing_dev_info is equivalent and shorter, I suspect. 
I forget why we added that. > + fs_info = device->dev_root->fs_info; > + limit = btrfs_async_submit_limit(fs_info); > + limit = limit * 2 / 3; ?? > +loop: > + spin_lock(&device->io_lock); > + > + /* take all the bios off the list at once and process them > + * later on (without the lock held). But, remember the > + * tail and other pointers so the bios can be properly reinserted > + * into the list if we hit congestion > + */ > + pending = device->pending_bios; > + tail = device->pending_bio_tail; > + WARN_ON(pending && !tail); > + device->pending_bios = NULL; > + device->pending_bio_tail = NULL; > + > + /* > + * if pending was null this time around, no bios need processing > + * at all and we can stop. Otherwise it'll loop back up again > + * and do an additional check so no bios are missed. > + * > + * device->running_pending is used to synchronize with the > + * schedule_bio code. > + */ > + if (pending) { > + again = 1; > + device->running_pending = 1; > + } else { > + again = 0; > + device->running_pending = 0; > + } > + spin_unlock(&device->io_lock); > + > + while (pending) { > + cur = pending; > + pending = pending->bi_next; > + cur->bi_next = NULL; > + atomic_dec(&fs_info->nr_async_bios); > + > + if (atomic_read(&fs_info->nr_async_bios) < limit && > + waitqueue_active(&fs_info->async_submit_wait)) > + wake_up(&fs_info->async_submit_wait); > + > + BUG_ON(atomic_read(&cur->bi_cnt) == 0); > + bio_get(cur); > + submit_bio(cur->bi_rw, cur); > + bio_put(cur); What do the bio_get()/bio_put() do? Looks wrong, or unneeded. > + num_run++; > + > + /* > + * we made progress, there is more work to do and the bdi > + * is now congested. Back off and let other work structs > + * run instead > + */ > + if (pending && bdi_write_congested(bdi) && Does this function handle only writes? If it handles reads, why do we test only write congestion? (please add comments if the questions aren't painfully stupid). 
> + fs_info->fs_devices->open_devices > 1) { > + struct bio *old_head; > + > + spin_lock(&device->io_lock); > + > + old_head = device->pending_bios; > + device->pending_bios = pending; > + if (device->pending_bio_tail) > + tail->bi_next = old_head; > + else > + device->pending_bio_tail = tail; > + > + spin_unlock(&device->io_lock); > + btrfs_requeue_work(&device->work); > + goto done; > + } > + } > + if (again) > + goto loop; > +done: > + return 0; > +} > + > +static void pending_bios_fn(struct btrfs_work *work) > +{ > + struct btrfs_device *device; > + > + device = container_of(work, struct btrfs_device, work); > + run_scheduled_bios(device); > +} > + > +static noinline int device_list_add(const char *path, > + struct btrfs_super_block *disk_super, > + u64 devid, struct btrfs_fs_devices **fs_devices_ret) > +{ > + struct btrfs_device *device; > + struct btrfs_fs_devices *fs_devices; > + u64 found_transid = btrfs_super_generation(disk_super); > + > + fs_devices = find_fsid(disk_super->fsid); > + if (!fs_devices) { > + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); Wondering why it's GFP_NOFS. Presumably this function is supposed to be called under btrfs_lock_volumes(), and we also call that function on the memory reclaim path? 
> + if (!fs_devices) > + return -ENOMEM; > + INIT_LIST_HEAD(&fs_devices->devices); > + INIT_LIST_HEAD(&fs_devices->alloc_list); > + list_add(&fs_devices->list, &fs_uuids); > + memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); > + fs_devices->latest_devid = devid; > + fs_devices->latest_trans = found_transid; > + device = NULL; > + } else { > + device = __find_device(&fs_devices->devices, devid, > + disk_super->dev_item.uuid); > + } > + if (!device) { > + if (fs_devices->opened) > + return -EBUSY; > + > + device = kzalloc(sizeof(*device), GFP_NOFS); > + if (!device) { > + /* we can safely leave the fs_devices entry around */ > + return -ENOMEM; > + } > + device->devid = devid; > + device->work.func = pending_bios_fn; > + memcpy(device->uuid, disk_super->dev_item.uuid, > + BTRFS_UUID_SIZE); > + device->barriers = 1; > + spin_lock_init(&device->io_lock); > + device->name = kstrdup(path, GFP_NOFS); > + if (!device->name) { > + kfree(device); > + return -ENOMEM; > + } > + INIT_LIST_HEAD(&device->dev_alloc_list); > + list_add(&device->dev_list, &fs_devices->devices); > + device->fs_devices = fs_devices; > + fs_devices->num_devices++; > + } > + > + if (found_transid > fs_devices->latest_trans) { > + fs_devices->latest_devid = devid; > + fs_devices->latest_trans = found_transid; > + } > + *fs_devices_ret = fs_devices; > + return 0; > +} > + > +static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) I wonder why this function exists. 
> +{ > + struct btrfs_fs_devices *fs_devices; > + struct btrfs_device *device; > + struct btrfs_device *orig_dev; > + > + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); > + if (!fs_devices) > + return ERR_PTR(-ENOMEM); > + > + INIT_LIST_HEAD(&fs_devices->devices); > + INIT_LIST_HEAD(&fs_devices->alloc_list); > + INIT_LIST_HEAD(&fs_devices->list); > + fs_devices->latest_devid = orig->latest_devid; > + fs_devices->latest_trans = orig->latest_trans; > + memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); > + > + list_for_each_entry(orig_dev, &orig->devices, dev_list) { > + device = kzalloc(sizeof(*device), GFP_NOFS); > + if (!device) > + goto error; > + > + device->name = kstrdup(orig_dev->name, GFP_NOFS); > + if (!device->name) > + goto error; We leaked *device? > + device->devid = orig_dev->devid; > + device->work.func = pending_bios_fn; > + memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); > + device->barriers = 1; > + spin_lock_init(&device->io_lock); > + INIT_LIST_HEAD(&device->dev_list); > + INIT_LIST_HEAD(&device->dev_alloc_list); > + > + list_add(&device->dev_list, &fs_devices->devices); > + device->fs_devices = fs_devices; > + fs_devices->num_devices++; > + } > + return fs_devices; > +error: > + free_fs_devices(fs_devices); > + return ERR_PTR(-ENOMEM); > +} > + > +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) What does this function do? An explanatory comment would help. > +{ > + struct list_head *tmp; > + struct list_head *cur; > + struct btrfs_device *device; > + > + mutex_lock(&uuid_mutex); btrfs_lock_volumes()? 
> +again: > + list_for_each_safe(cur, tmp, &fs_devices->devices) { > + device = list_entry(cur, struct btrfs_device, dev_list); > + if (device->in_fs_metadata) > + continue; > + > + if (device->bdev) { > + close_bdev_exclusive(device->bdev, device->mode); > + device->bdev = NULL; > + fs_devices->open_devices--; > + } > + if (device->writeable) { > + list_del_init(&device->dev_alloc_list); > + device->writeable = 0; > + fs_devices->rw_devices--; > + } Why are we bothering to modify *device? It's about to be freed. > + list_del_init(&device->dev_list); > + fs_devices->num_devices--; > + kfree(device->name); > + kfree(device); > + } > + > + if (fs_devices->seed) { > + fs_devices = fs_devices->seed; > + goto again; > + } > + > + mutex_unlock(&uuid_mutex); ... > + return 0; > +} > + > +static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) > +{ > + struct list_head *cur; > + struct btrfs_device *device; > + > + if (--fs_devices->opened > 0) > + return 0; > + > + list_for_each(cur, &fs_devices->devices) { > + device = list_entry(cur, struct btrfs_device, dev_list); > + if (device->bdev) { This reader is wondering what btrfs_device.bdev==NULL means. How would one go about determining this? > + close_bdev_exclusive(device->bdev, device->mode); > + fs_devices->open_devices--; > + } > + if (device->writeable) { > + list_del_init(&device->dev_alloc_list); > + fs_devices->rw_devices--; > + } > + > + device->bdev = NULL; > + device->writeable = 0; > + device->in_fs_metadata = 0; What does in_fs_metadata mean? Perhaps those three fields should have type `bool'. > + } > + WARN_ON(fs_devices->open_devices); > + WARN_ON(fs_devices->rw_devices); > + fs_devices->opened = 0; > + fs_devices->seeding = 0; What does the `seeding' field mean? It's a boolean, I assume.. 
> + return 0; > +} > + > +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) > +{ > + struct btrfs_fs_devices *seed_devices = NULL; > + int ret; > + > + mutex_lock(&uuid_mutex); btrfs_lock_volumes()? (Please check whole fs for consistency here) Seems strange that a lock called "uuid_mutex" is protecting a pile of things which don't seem very uuid-related. > + ret = __btrfs_close_devices(fs_devices); > + if (!fs_devices->opened) { > + seed_devices = fs_devices->seed; > + fs_devices->seed = NULL; > + } > + mutex_unlock(&uuid_mutex); > + > + while (seed_devices) { > + fs_devices = seed_devices; > + seed_devices = fs_devices->seed; > + __btrfs_close_devices(fs_devices); > + free_fs_devices(fs_devices); > + } > + return ret; > +} > + > +static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, > + fmode_t flags, void *holder) > +{ > + struct block_device *bdev; > + struct list_head *head = &fs_devices->devices; > + struct list_head *cur; > + struct btrfs_device *device; > + struct block_device *latest_bdev = NULL; > + struct buffer_head *bh; > + struct btrfs_super_block *disk_super; > + u64 latest_devid = 0; > + u64 latest_transid = 0; > + u64 devid; > + int seeding = 1; > + int ret = 0; > + > + list_for_each(cur, head) { > + device = list_entry(cur, struct btrfs_device, dev_list); list_for_each_entry() > + if (device->bdev) > + continue; > + if (!device->name) > + continue; What's the algorithm for understanding the above four lines? > + bdev = open_bdev_exclusive(device->name, flags, holder); > + if (IS_ERR(bdev)) { > + printk(KERN_INFO "open %s failed\n", device->name); > + goto error; > + } > + set_blocksize(bdev, 4096); hm. What's the situation with btrfs blocksizes? 
> + bh = btrfs_read_dev_super(bdev); > + if (!bh) > + goto error_close; > + > + disk_super = (struct btrfs_super_block *)bh->b_data; > + devid = le64_to_cpu(disk_super->dev_item.devid); > + if (devid != device->devid) > + goto error_brelse; > + > + if (memcmp(device->uuid, disk_super->dev_item.uuid, > + BTRFS_UUID_SIZE)) > + goto error_brelse; > + > + device->generation = btrfs_super_generation(disk_super); > + if (!latest_transid || device->generation > latest_transid) { > + latest_devid = devid; > + latest_transid = device->generation; > + latest_bdev = bdev; > + } > + > + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { > + device->writeable = 0; > + } else { > + device->writeable = !bdev_read_only(bdev); > + seeding = 0; > + } > + > + device->bdev = bdev; > + device->in_fs_metadata = 0; > + device->mode = flags; > + > + fs_devices->open_devices++; > + if (device->writeable) { > + fs_devices->rw_devices++; > + list_add(&device->dev_alloc_list, > + &fs_devices->alloc_list); > + } > + continue; > + > +error_brelse: > + brelse(bh); brelse() is only needed in situations where the pointer might be NULL. 
> +error_close: > + close_bdev_exclusive(bdev, FMODE_READ); > +error: > + continue; > + } > + if (fs_devices->open_devices == 0) { > + ret = -EIO; > + goto out; > + } > + fs_devices->seeding = seeding; > + fs_devices->opened = 1; > + fs_devices->latest_bdev = latest_bdev; > + fs_devices->latest_devid = latest_devid; > + fs_devices->latest_trans = latest_transid; > + fs_devices->total_rw_bytes = 0; > +out: > + return ret; > +} > + > +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, > + fmode_t flags, void *holder) > +{ > + int ret; > + > + mutex_lock(&uuid_mutex); > + if (fs_devices->opened) { > + fs_devices->opened++; > + ret = 0; > + } else { > + ret = __btrfs_open_devices(fs_devices, flags, holder); > + } > + mutex_unlock(&uuid_mutex); > + return ret; > +} > + > +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, > + struct btrfs_fs_devices **fs_devices_ret) > +{ > + struct btrfs_super_block *disk_super; > + struct block_device *bdev; > + struct buffer_head *bh; > + int ret; > + u64 devid; > + u64 transid; > + > + mutex_lock(&uuid_mutex); > + > + bdev = open_bdev_exclusive(path, flags, holder); > + > + if (IS_ERR(bdev)) { > + ret = PTR_ERR(bdev); > + goto error; > + } > + > + ret = set_blocksize(bdev, 4096); > + if (ret) > + goto error_close; > + bh = btrfs_read_dev_super(bdev); > + if (!bh) { > + ret = -EIO; > + goto error_close; > + } > + disk_super = (struct btrfs_super_block *)bh->b_data; <looks at struct btrfs_super_block> It needs work, I think. This: __le64 compat_ro_flags; __le64 incompat_flags; __le16 csum_type; u8 root_level; u8 chunk_root_level; u8 log_root_level; struct btrfs_dev_item dev_item; doesn't look like it will successfully port between architectures. Also `struct btrfs_dev_item' has three le32's wedged between le64's. I suppose that the __packed will help in some cases (but not the btrfs_super_block, surely). However it seems unwise/incautious/ugly to rely upon __packed in this manner. 
Please do s/__attribute__ ((__packed__))/__packed/g > + devid = le64_to_cpu(disk_super->dev_item.devid); > + transid = btrfs_super_generation(disk_super); > + if (disk_super->label[0]) > + printk(KERN_INFO "device label %s ", disk_super->label); > + else { > + /* FIXME, make a readl uuid parser */ what's that? > + printk(KERN_INFO "device fsid %llx-%llx ", > + *(unsigned long long *)disk_super->fsid, > + *(unsigned long long *)(disk_super->fsid + 8)); > + } > + printk(KERN_INFO "devid %llu transid %llu %s\n", > + (unsigned long long)devid, (unsigned long long)transid, path); > + ret = device_list_add(path, disk_super, devid, fs_devices_ret); > + > + brelse(bh); bh is non-NULL here > +error_close: > + close_bdev_exclusive(bdev, flags); > +error: > + mutex_unlock(&uuid_mutex); > + return ret; > +} > + > +/* > + * this uses a pretty simple search, the expectation is that it is > + * called very infrequently and that a given device has a small number > + * of extents > + */ > +static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, > + struct btrfs_device *device, > + u64 num_bytes, u64 *start) > +{ > + struct btrfs_key key; > + struct btrfs_root *root = device->dev_root; > + struct btrfs_dev_extent *dev_extent = NULL; > + struct btrfs_path *path; > + u64 hole_size = 0; > + u64 last_byte = 0; > + u64 search_start = 0; > + u64 search_end = device->total_bytes; > + int ret; > + int slot = 0; > + int start_found; > + struct extent_buffer *l; > + > + path = btrfs_alloc_path(); > + if (!path) > + return -ENOMEM; > + path->reada = 2; > + start_found = 0; > + > + /* FIXME use last free of some kind */ > + > + /* we don't want to overwrite the superblock on the drive, > + * so we make sure to start at an offset of at least 1MB > + */ > + search_start = max((u64)1024 * 1024, search_start); max_t > + if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) > + search_start = max(root->fs_info->alloc_start, search_start); > + > + key.objectid = 
device->devid; > + key.offset = search_start; > + key.type = BTRFS_DEV_EXTENT_KEY; > + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); > + if (ret < 0) > + goto error; > + ret = btrfs_previous_item(root, path, 0, key.type); > + if (ret < 0) > + goto error; > + l = path->nodes[0]; > + btrfs_item_key_to_cpu(l, &key, path->slots[0]); > + while (1) { > + l = path->nodes[0]; > + slot = path->slots[0]; > + if (slot >= btrfs_header_nritems(l)) { > + ret = btrfs_next_leaf(root, path); > + if (ret == 0) > + continue; > + if (ret < 0) > + goto error; > +no_more_items: > + if (!start_found) { > + if (search_start >= search_end) { > + ret = -ENOSPC; Hey. Shouldn't that go BUG()? > + goto error; > + } > + *start = search_start; > + start_found = 1; > + goto check_pending; > + } > + *start = last_byte > search_start ? > + last_byte : search_start; max()? > + if (search_end <= *start) { > + ret = -ENOSPC; > + goto error; > + } > + goto check_pending; > + } > + btrfs_item_key_to_cpu(l, &key, slot); > + > + if (key.objectid < device->devid) > + goto next; > + > + if (key.objectid > device->devid) > + goto no_more_items; > + > + if (key.offset >= search_start && key.offset > last_byte && > + start_found) { > + if (last_byte < search_start) > + last_byte = search_start; > + hole_size = key.offset - last_byte; > + if (key.offset > last_byte && > + hole_size >= num_bytes) { > + *start = last_byte; > + goto check_pending; > + } > + } > + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) > + goto next; > + > + start_found = 1; > + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); > + last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); > +next: > + path->slots[0]++; > + cond_resched(); > + } > +check_pending: > + /* we have to make sure we didn't find an extent that has already > + * been allocated by the map tree or the original allocation > + */ > + BUG_ON(*start < search_start); Is it triggerable by on-disk corruption? 
> + if (*start + num_bytes > search_end) { > + ret = -ENOSPC; > + goto error; > + } > + /* check for pending inserts here */ > + ret = 0; > + > +error: > + btrfs_free_path(path); > + return ret; > +} > + > +static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, > + struct btrfs_device *device, > + u64 start) > +{ > + int ret; > + struct btrfs_path *path; > + struct btrfs_root *root = device->dev_root; > + struct btrfs_key key; > + struct btrfs_key found_key; > + struct extent_buffer *leaf = NULL; > + struct btrfs_dev_extent *extent = NULL; > + > + path = btrfs_alloc_path(); > + if (!path) > + return -ENOMEM; > + > + key.objectid = device->devid; > + key.offset = start; > + key.type = BTRFS_DEV_EXTENT_KEY; > + > + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); <looks at btrfs_search_slot> * if ins_len > 0, nodes and leaves will be split as we walk down the * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if * possible) The (ab)use of the sign of formal arg `ins_len' seems a bit nasty? > + if (ret > 0) { > + ret = btrfs_previous_item(root, path, key.objectid, > + BTRFS_DEV_EXTENT_KEY); > + BUG_ON(ret); > + leaf = path->nodes[0]; > + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); > + extent = btrfs_item_ptr(leaf, path->slots[0], > + struct btrfs_dev_extent); > + BUG_ON(found_key.offset > start || found_key.offset + > + btrfs_dev_extent_length(leaf, extent) < start); > + ret = 0; > + } else if (ret == 0) { > + leaf = path->nodes[0]; > + extent = btrfs_item_ptr(leaf, path->slots[0], > + struct btrfs_dev_extent); > + } > + BUG_ON(ret); > + > + if (device->bytes_used > 0) > + device->bytes_used -= btrfs_dev_extent_length(leaf, extent); > + ret = btrfs_del_item(trans, root, path); > + BUG_ON(ret); > + > + btrfs_free_path(path); > + return ret; > +} > + > > ... 
> > +static noinline int find_next_chunk(struct btrfs_root *root, > + u64 objectid, u64 *offset) > +{ > + struct btrfs_path *path; > + int ret; > + struct btrfs_key key; > + struct btrfs_chunk *chunk; > + struct btrfs_key found_key; > + > + path = btrfs_alloc_path(); > + BUG_ON(!path); Stuff like this will eventually have to be weeded out and fixed. It at least needs a big FIXME the instant it is typed in. Quietly slipping it in there in this manner is just creating more work for yourselves. > + key.objectid = objectid; > + key.offset = (u64)-1; > + key.type = BTRFS_CHUNK_ITEM_KEY; > + > + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); > + if (ret < 0) > + goto error; > + > + BUG_ON(ret == 0); Is this triggerable by a corrupted fs? > + ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); > + if (ret) { > + *offset = 0; > + } else { > + btrfs_item_key_to_cpu(path->nodes[0], &found_key, > + path->slots[0]); > + if (found_key.objectid != objectid) > + *offset = 0; > + else { > + chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], > + struct btrfs_chunk); > + *offset = found_key.offset + > + btrfs_chunk_length(path->nodes[0], chunk); > + } > + } > + ret = 0; > +error: > + btrfs_free_path(path); > + return ret; > +} > + > > ... 
> > + > +/* > + * the device information is stored in the chunk root > + * the btrfs_device struct should be fully filled in > + */ > +int btrfs_add_device(struct btrfs_trans_handle *trans, > + struct btrfs_root *root, > + struct btrfs_device *device) > +{ > + int ret; > + struct btrfs_path *path; > + struct btrfs_dev_item *dev_item; > + struct extent_buffer *leaf; > + struct btrfs_key key; > + unsigned long ptr; > + > + root = root->fs_info->chunk_root; > + > + path = btrfs_alloc_path(); > + if (!path) > + return -ENOMEM; > + > + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; > + key.type = BTRFS_DEV_ITEM_KEY; > + key.offset = device->devid; > + > + ret = btrfs_insert_empty_item(trans, root, path, &key, > + sizeof(*dev_item)); > + if (ret) > + goto out; > + > + leaf = path->nodes[0]; > + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); > + > + btrfs_set_device_id(leaf, dev_item, device->devid); > + btrfs_set_device_generation(leaf, dev_item, 0); > + btrfs_set_device_type(leaf, dev_item, device->type); > + btrfs_set_device_io_align(leaf, dev_item, device->io_align); > + btrfs_set_device_io_width(leaf, dev_item, device->io_width); > + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); > + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); > + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); > + btrfs_set_device_group(leaf, dev_item, 0); > + btrfs_set_device_seek_speed(leaf, dev_item, 0); > + btrfs_set_device_bandwidth(leaf, dev_item, 0); > + btrfs_set_device_start_offset(leaf, dev_item, 0); > + > + ptr = (unsigned long)btrfs_device_uuid(dev_item); > + write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); > + ptr = (unsigned long)btrfs_device_fsid(dev_item); > + write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); > + btrfs_mark_buffer_dirty(leaf); I expected a function called btrfs_mark_buffer_dirty() to be a wrapper around mark_buffer_dirty(). 
> + ret = 0; > +out: > + btrfs_free_path(path); > + return ret; > +} > + > > ... > > + > +int btrfs_rm_device(struct btrfs_root *root, char *device_path) > +{ > + struct btrfs_device *device; > + struct btrfs_device *next_device; > + struct block_device *bdev; > + struct buffer_head *bh = NULL; > + struct btrfs_super_block *disk_super; > + u64 all_avail; > + u64 devid; > + u64 num_devices; > + u8 *dev_uuid; > + int ret = 0; > + > + mutex_lock(&uuid_mutex); btrfs_lock_volumes() > + mutex_lock(&root->fs_info->volume_mutex); lock_chunks() (which needs renaming) > + all_avail = root->fs_info->avail_data_alloc_bits | > + root->fs_info->avail_system_alloc_bits | > + root->fs_info->avail_metadata_alloc_bits; > + > + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && > + root->fs_info->fs_devices->rw_devices <= 4) { > + printk(KERN_ERR "btrfs: unable to go below four devices " > + "on raid10\n"); > + ret = -EINVAL; > + goto out; > + } > + > + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && > + root->fs_info->fs_devices->rw_devices <= 2) { > + printk(KERN_ERR "btrfs: unable to go below two " > + "devices on raid1\n"); > + ret = -EINVAL; > + goto out; > + } > + > + if (strcmp(device_path, "missing") == 0) { <wonders what interface all this is implementing> > + struct list_head *cur; > + struct list_head *devices; > + struct btrfs_device *tmp; > + > + device = NULL; > + devices = &root->fs_info->fs_devices->devices; > + list_for_each(cur, devices) { > + tmp = list_entry(cur, struct btrfs_device, dev_list); > + if (tmp->in_fs_metadata && !tmp->bdev) { > + device = tmp; > + break; > + } > + } > + bdev = NULL; > + bh = NULL; > + disk_super = NULL; > + if (!device) { > + printk(KERN_ERR "btrfs: no missing devices found to " > + "remove\n"); > + goto out; > + } > + } else { > + bdev = open_bdev_exclusive(device_path, FMODE_READ, > + root->fs_info->bdev_holder); > + if (IS_ERR(bdev)) { > + ret = PTR_ERR(bdev); > + goto out; > + } > + > + set_blocksize(bdev, 4096); > + bh = 
btrfs_read_dev_super(bdev); > + if (!bh) { > + ret = -EIO; > + goto error_close; > + } > + disk_super = (struct btrfs_super_block *)bh->b_data; > + devid = le64_to_cpu(disk_super->dev_item.devid); > + dev_uuid = disk_super->dev_item.uuid; > + device = btrfs_find_device(root, devid, dev_uuid, > + disk_super->fsid); > + if (!device) { > + ret = -ENOENT; > + goto error_brelse; > + } > + } > + > + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { > + printk(KERN_ERR "btrfs: unable to remove the only writeable " > + "device\n"); > + ret = -EINVAL; > + goto error_brelse; > + } > + > + if (device->writeable) { > + list_del_init(&device->dev_alloc_list); > + root->fs_info->fs_devices->rw_devices--; > + } > + > + ret = btrfs_shrink_device(device, 0); > + if (ret) > + goto error_brelse; > + > + ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); > + if (ret) > + goto error_brelse; > + > + device->in_fs_metadata = 0; > + list_del_init(&device->dev_list); > + device->fs_devices->num_devices--; > + > + next_device = list_entry(root->fs_info->fs_devices->devices.next, > + struct btrfs_device, dev_list); > + if (device->bdev == root->fs_info->sb->s_bdev) > + root->fs_info->sb->s_bdev = next_device->bdev; > + if (device->bdev == root->fs_info->fs_devices->latest_bdev) > + root->fs_info->fs_devices->latest_bdev = next_device->bdev; > + > + if (device->bdev) { > + close_bdev_exclusive(device->bdev, device->mode); > + device->bdev = NULL; > + device->fs_devices->open_devices--; > + } > + > + num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; > + btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); > + > + if (device->fs_devices->open_devices == 0) { > + struct btrfs_fs_devices *fs_devices; > + fs_devices = root->fs_info->fs_devices; > + while (fs_devices) { > + if (fs_devices->seed == device->fs_devices) > + break; > + fs_devices = fs_devices->seed; > + } > + fs_devices->seed = device->fs_devices->seed; > + 
device->fs_devices->seed = NULL; > + __btrfs_close_devices(device->fs_devices); > + free_fs_devices(device->fs_devices); > + } > + > + /* > + * at this point, the device is zero sized. We want to > + * remove it from the devices list and zero out the old super > + */ > + if (device->writeable) { > + /* make sure this device isn't detected as part of > + * the FS anymore > + */ > + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); > + set_buffer_dirty(bh); > + sync_dirty_buffer(bh); > + } > + > + kfree(device->name); > + kfree(device); > + ret = 0; > + > +error_brelse: > + brelse(bh); > +error_close: > + if (bdev) > + close_bdev_exclusive(bdev, FMODE_READ); > +out: > + mutex_unlock(&root->fs_info->volume_mutex); > + mutex_unlock(&uuid_mutex); > + return ret; > +} > + > +/* > + * does all the dirty work required for changing file system's UUID. > + */ > +static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, > + struct btrfs_root *root) > +{ > + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; > + struct btrfs_fs_devices *old_devices; > + struct btrfs_fs_devices *seed_devices; > + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; > + struct btrfs_device *device; > + u64 super_flags; > + > + BUG_ON(!mutex_is_locked(&uuid_mutex)); Given the (regrettable?) existence of btrfs_lock_volumes() and btrfs_unlock_volumes(), it would make sense to add another regrettable wrapper for the above operation. 
> + if (!fs_devices->seeding) > + return -EINVAL; <Again wonders what "seeding" means> > + seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); > + if (!seed_devices) > + return -ENOMEM; > + > + old_devices = clone_fs_devices(fs_devices); > + if (IS_ERR(old_devices)) { > + kfree(seed_devices); > + return PTR_ERR(old_devices); > + } > + > + list_add(&old_devices->list, &fs_uuids); > + > + memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); > + seed_devices->opened = 1; > + INIT_LIST_HEAD(&seed_devices->devices); > + INIT_LIST_HEAD(&seed_devices->alloc_list); > + list_splice_init(&fs_devices->devices, &seed_devices->devices); > + list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); > + list_for_each_entry(device, &seed_devices->devices, dev_list) { > + device->fs_devices = seed_devices; > + } unneeded braces <runs checkpatch> <checkpatch misses this - regression?> #2001: FILE: fs/btrfs/volumes.c:1930: + return calc_size * num_stripes; ^ ERROR: space prohibited after that '*' (ctx:WxW) #2072: FILE: fs/btrfs/volumes.c:2001: + max_chunk_size = calc_size * 2; ^ ERROR: space prohibited after that '*' (ctx:WxW) #2089: FILE: fs/btrfs/volumes.c:2018: + if (calc_size * num_stripes > max_chunk_size) { ^ ERROR: space prohibited after that '*' (ctx:WxW) #2105: FILE: fs/btrfs/volumes.c:2034: + min_free = calc_size * 2; ^ hm, those checkpatch errors are all false positives — these are multiplications, not pointer dereferences, so the spacing in the patch is correct. > + fs_devices->seeding = 0; > + fs_devices->num_devices = 0; > + fs_devices->open_devices = 0; > + fs_devices->seed = seed_devices; > + > + generate_random_uuid(fs_devices->fsid); > + memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); > + memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); > + super_flags = btrfs_super_flags(disk_super) & > + ~BTRFS_SUPER_FLAG_SEEDING; > + btrfs_set_super_flags(disk_super, super_flags); > + > + return 0; > +} > + > > ... 
> > + > +int btrfs_init_new_device(struct btrfs_root *root, char *device_path) > +{ > + struct btrfs_trans_handle *trans; > + struct btrfs_device *device; > + struct block_device *bdev; > + struct list_head *cur; > + struct list_head *devices; > + struct super_block *sb = root->fs_info->sb; > + u64 total_bytes; > + int seeding_dev = 0; > + int ret = 0; > + > + if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) > + return -EINVAL; > + > + bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); > + if (!bdev) > + return -EIO; > + > + if (root->fs_info->fs_devices->seeding) { > + seeding_dev = 1; > + down_write(&sb->s_umount); > + mutex_lock(&uuid_mutex); > + } > + > + filemap_write_and_wait(bdev->bd_inode->i_mapping); I'm surprised that this can be safely done under ->s_umount. > + mutex_lock(&root->fs_info->volume_mutex); > + > + devices = &root->fs_info->fs_devices->devices; > + list_for_each(cur, devices) { > + device = list_entry(cur, struct btrfs_device, dev_list); > + if (device->bdev == bdev) { > + ret = -EEXIST; > + goto error; > + } > + } > + > + device = kzalloc(sizeof(*device), GFP_NOFS); > + if (!device) { > + /* we can safely leave the fs_devices entry around */ > + ret = -ENOMEM; > + goto error; > + } > + > + device->name = kstrdup(device_path, GFP_NOFS); > + if (!device->name) { > + kfree(device); > + ret = -ENOMEM; > + goto error; > + } > + > + ret = find_next_devid(root, &device->devid); > + if (ret) { > + kfree(device); Leaks device->name > + goto error; > + } > + > + trans = btrfs_start_transaction(root, 1); > + lock_chunks(root); > + > + device->barriers = 1; > + device->writeable = 1; > + device->work.func = pending_bios_fn; Should we be using INIT_WORK() here? It has lockdep implications. (Multiple instances). 
> + generate_random_uuid(device->uuid); > + spin_lock_init(&device->io_lock); > + device->generation = trans->transid; > + device->io_width = root->sectorsize; > + device->io_align = root->sectorsize; > + device->sector_size = root->sectorsize; > + device->total_bytes = i_size_read(bdev->bd_inode); > + device->dev_root = root->fs_info->dev_root; > + device->bdev = bdev; > + device->in_fs_metadata = 1; > + device->mode = 0; > + set_blocksize(device->bdev, 4096); > + > + if (seeding_dev) { > + sb->s_flags &= ~MS_RDONLY; > + ret = btrfs_prepare_sprout(trans, root); > + BUG_ON(ret); > + } > + > + device->fs_devices = root->fs_info->fs_devices; > + list_add(&device->dev_list, &root->fs_info->fs_devices->devices); > + list_add(&device->dev_alloc_list, > + &root->fs_info->fs_devices->alloc_list); > + root->fs_info->fs_devices->num_devices++; > + root->fs_info->fs_devices->open_devices++; > + root->fs_info->fs_devices->rw_devices++; > + root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; > + > + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); > + btrfs_set_super_total_bytes(&root->fs_info->super_copy, > + total_bytes + device->total_bytes); > + > + total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); > + btrfs_set_super_num_devices(&root->fs_info->super_copy, > + total_bytes + 1); > + > + if (seeding_dev) { > + ret = init_first_rw_device(trans, root, device); > + BUG_ON(ret); > + ret = btrfs_finish_sprout(trans, root); > + BUG_ON(ret); > + } else { > + ret = btrfs_add_device(trans, root, device); > + } > + > + unlock_chunks(root); > + btrfs_commit_transaction(trans, root); > + > + if (seeding_dev) { > + mutex_unlock(&uuid_mutex); > + up_write(&sb->s_umount); > + > + ret = btrfs_relocate_sys_chunks(root); > + BUG_ON(ret); > + } > +out: > + mutex_unlock(&root->fs_info->volume_mutex); > + return ret; > +error: > + close_bdev_exclusive(bdev, 0); > + if (seeding_dev) { > + mutex_unlock(&uuid_mutex); > + up_write(&sb->s_umount); > + 
} > + goto out; > +} > + <attention span exceeded, sorry> -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html