On Wed, 7 Jan 2009 22:56:52 -0500 Chris Mason <chris.mason@xxxxxxxxxx> wrote: > All block and extent pointers in btrfs use a logical address space. This file > is responsible for mapping that logical address to a physical block on disk. > > It implements raid0, raid1 and raid10. The block groups used by extent-tree.c > are allocated from each device in chunks of ~1GB in size. This file maintains > the chunk tree and device allocation tree that are used to record the > areas of each disk that are in use. > > Each super block has a bootstrapping section that is used to translate > the extents in the chunk tree. The chunk tree is used to translate all > of the other extents in the filesystem to a physical block on disk. Where would one go to find out about the user-visible management of these devices? > This file also provides async bio submission code. The bios are often > sent down by async helper threads that are supposed to be working on > checksumming or other cpu intensive tasks. We don't want those tasks > waiting for a free request, so a helper thread is used here that is willing > to actually wait on the drive. Seems odd. Why have a thread to wait on IO? Why not just fire-and-forget, and then only block if some caller wishes to use the results of that IO? > > ... > > +struct map_lookup { > + u64 type; > + int io_align; > + int io_width; > + int stripe_len; > + int sector_size; > + int num_stripes; > + int sub_stripes; > + struct btrfs_bio_stripe stripes[]; > +}; this structure looks interesting. 
> +static int init_first_rw_device(struct btrfs_trans_handle *trans, > + struct btrfs_root *root, > + struct btrfs_device *device); > +static int btrfs_relocate_sys_chunks(struct btrfs_root *root); > + > +#define map_lookup_size(n) (sizeof(struct map_lookup) + \ > + (sizeof(struct btrfs_bio_stripe) * (n))) > + > +static DEFINE_MUTEX(uuid_mutex); > +static LIST_HEAD(fs_uuids); > + > +void btrfs_lock_volumes(void) > +{ > + mutex_lock(&uuid_mutex); > +} > + > +void btrfs_unlock_volumes(void) > +{ > + mutex_unlock(&uuid_mutex); > +} > + > +static void lock_chunks(struct btrfs_root *root) > +{ > + mutex_lock(&root->fs_info->chunk_mutex); > +} > + > +static void unlock_chunks(struct btrfs_root *root) > +{ > + mutex_unlock(&root->fs_info->chunk_mutex); > +} > + > +static void free_fs_devices(struct btrfs_fs_devices *fs_devices) > +{ > + struct btrfs_device *device; > + WARN_ON(fs_devices->opened); > + while (!list_empty(&fs_devices->devices)) { > + device = list_entry(fs_devices->devices.next, > + struct btrfs_device, dev_list); > + list_del(&device->dev_list); > + kfree(device->name); > + kfree(device); > + } > + kfree(fs_devices); > +} Called under btrfs_lock_volumes(), one assumes/hopes. A WARN_ON(!mutex_is_locked()) is a nice way of documenting such things. > +int btrfs_cleanup_fs_uuids(void) > +{ > + struct btrfs_fs_devices *fs_devices; > + > + while (!list_empty(&fs_uuids)) { > + fs_devices = list_entry(fs_uuids.next, > + struct btrfs_fs_devices, list); > + list_del(&fs_devices->list); > + free_fs_devices(fs_devices); > + } > + return 0; > +} Ditto. > +static noinline struct btrfs_device *__find_device(struct list_head *head, > + u64 devid, u8 *uuid) It's not possible for the code reader to work out why all these noinlines are here (this reader, at least). 
> +{ > + struct btrfs_device *dev; > + struct list_head *cur; > + > + list_for_each(cur, head) { > + dev = list_entry(cur, struct btrfs_device, dev_list); > + if (dev->devid == devid && > + (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { > + return dev; > + } > + } > + return NULL; > +} > + > +static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) > +{ > + struct list_head *cur; > + struct btrfs_fs_devices *fs_devices; > + > + list_for_each(cur, &fs_uuids) { > + fs_devices = list_entry(cur, struct btrfs_fs_devices, list); > + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) > + return fs_devices; > + } > + return NULL; > +} Called under btrfs_lock_volumes()... > +/* > + * we try to collect pending bios for a device so we don't get a large > + * number of procs sending bios down to the same device. This greatly > + * improves the schedulers ability to collect and merge the bios. eh? The block layer already implements limiting. Why does _not_ sending bios to a device aid the IO scheduler's merging activity? The comment needs help, I suspect. > + * But, it also turns into a long list of bios to process and that is sure > + * to eventually make the worker thread block. Why is that bad? If there's a large amount of IO queued then blocking seems an appropriate thing to do. > The solution here is to > + * make some progress and then put this work struct back at the end of > + * the list if the block device is congested. This way, multiple devices > + * can make progress from a single worker thread. hrm, OK. I think. > + */ > +static noinline int run_scheduled_bios(struct btrfs_device *device) > +{ > + struct bio *pending; > + struct backing_dev_info *bdi; > + struct btrfs_fs_info *fs_info; > + struct bio *tail; > + struct bio *cur; > + int again = 0; > + unsigned long num_run = 0; > + unsigned long limit; > + > + bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; device->bdev->bd_inode_backing_dev_info is equivalent and shorter, I suspect. 
I forget why we added that. > + fs_info = device->dev_root->fs_info; > + limit = btrfs_async_submit_limit(fs_info); > + limit = limit * 2 / 3; ?? > +loop: > + spin_lock(&device->io_lock); > + > + /* take all the bios off the list at once and process them > + * later on (without the lock held). But, remember the > + * tail and other pointers so the bios can be properly reinserted > + * into the list if we hit congestion > + */ > + pending = device->pending_bios; > + tail = device->pending_bio_tail; > + WARN_ON(pending && !tail); > + device->pending_bios = NULL; > + device->pending_bio_tail = NULL; > + > + /* > + * if pending was null this time around, no bios need processing > + * at all and we can stop. Otherwise it'll loop back up again > + * and do an additional check so no bios are missed. > + * > + * device->running_pending is used to synchronize with the > + * schedule_bio code. > + */ > + if (pending) { > + again = 1; > + device->running_pending = 1; > + } else { > + again = 0; > + device->running_pending = 0; > + } > + spin_unlock(&device->io_lock); > + > + while (pending) { > + cur = pending; > + pending = pending->bi_next; > + cur->bi_next = NULL; > + atomic_dec(&fs_info->nr_async_bios); > + > + if (atomic_read(&fs_info->nr_async_bios) < limit && > + waitqueue_active(&fs_info->async_submit_wait)) > + wake_up(&fs_info->async_submit_wait); > + > + BUG_ON(atomic_read(&cur->bi_cnt) == 0); > + bio_get(cur); > + submit_bio(cur->bi_rw, cur); > + bio_put(cur); What do the bio_get()/bio_put() do? Looks wrong, or unneeded. > + num_run++; > + > + /* > + * we made progress, there is more work to do and the bdi > + * is now congested. Back off and let other work structs > + * run instead > + */ > + if (pending && bdi_write_congested(bdi) && Does this function handle only writes? If it handles reads, why do we test only write congestion? (please add comments if the questions aren't painfully stupid). 
> + fs_info->fs_devices->open_devices > 1) { > + struct bio *old_head; > + > + spin_lock(&device->io_lock); > + > + old_head = device->pending_bios; > + device->pending_bios = pending; > + if (device->pending_bio_tail) > + tail->bi_next = old_head; > + else > + device->pending_bio_tail = tail; > + > + spin_unlock(&device->io_lock); > + btrfs_requeue_work(&device->work); > + goto done; > + } > + } > + if (again) > + goto loop; > +done: > + return 0; > +} > + > +static void pending_bios_fn(struct btrfs_work *work) > +{ > + struct btrfs_device *device; > + > + device = container_of(work, struct btrfs_device, work); > + run_scheduled_bios(device); > +} > + > +static noinline int device_list_add(const char *path, > + struct btrfs_super_block *disk_super, > + u64 devid, struct btrfs_fs_devices **fs_devices_ret) > +{ > + struct btrfs_device *device; > + struct btrfs_fs_devices *fs_devices; > + u64 found_transid = btrfs_super_generation(disk_super); > + > + fs_devices = find_fsid(disk_super->fsid); > + if (!fs_devices) { > + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); Wondering why it's GFP_NOFS. Presumably this function is supposed to be called under btrfs_lock_volumes(), and we also call that function on the memory reclaim path? 
> + if (!fs_devices) > + return -ENOMEM; > + INIT_LIST_HEAD(&fs_devices->devices); > + INIT_LIST_HEAD(&fs_devices->alloc_list); > + list_add(&fs_devices->list, &fs_uuids); > + memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); > + fs_devices->latest_devid = devid; > + fs_devices->latest_trans = found_transid; > + device = NULL; > + } else { > + device = __find_device(&fs_devices->devices, devid, > + disk_super->dev_item.uuid); > + } > + if (!device) { > + if (fs_devices->opened) > + return -EBUSY; > + > + device = kzalloc(sizeof(*device), GFP_NOFS); > + if (!device) { > + /* we can safely leave the fs_devices entry around */ > + return -ENOMEM; > + } > + device->devid = devid; > + device->work.func = pending_bios_fn; > + memcpy(device->uuid, disk_super->dev_item.uuid, > + BTRFS_UUID_SIZE); > + device->barriers = 1; > + spin_lock_init(&device->io_lock); > + device->name = kstrdup(path, GFP_NOFS); > + if (!device->name) { > + kfree(device); > + return -ENOMEM; > + } > + INIT_LIST_HEAD(&device->dev_alloc_list); > + list_add(&device->dev_list, &fs_devices->devices); > + device->fs_devices = fs_devices; > + fs_devices->num_devices++; > + } > + > + if (found_transid > fs_devices->latest_trans) { > + fs_devices->latest_devid = devid; > + fs_devices->latest_trans = found_transid; > + } > + *fs_devices_ret = fs_devices; > + return 0; > +} > + > +static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) I wonder why this function exists. 
> +{ > + struct btrfs_fs_devices *fs_devices; > + struct btrfs_device *device; > + struct btrfs_device *orig_dev; > + > + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); > + if (!fs_devices) > + return ERR_PTR(-ENOMEM); > + > + INIT_LIST_HEAD(&fs_devices->devices); > + INIT_LIST_HEAD(&fs_devices->alloc_list); > + INIT_LIST_HEAD(&fs_devices->list); > + fs_devices->latest_devid = orig->latest_devid; > + fs_devices->latest_trans = orig->latest_trans; > + memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); > + > + list_for_each_entry(orig_dev, &orig->devices, dev_list) { > + device = kzalloc(sizeof(*device), GFP_NOFS); > + if (!device) > + goto error; > + > + device->name = kstrdup(orig_dev->name, GFP_NOFS); > + if (!device->name) > + goto error; We leaked *device? > + device->devid = orig_dev->devid; > + device->work.func = pending_bios_fn; > + memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); > + device->barriers = 1; > + spin_lock_init(&device->io_lock); > + INIT_LIST_HEAD(&device->dev_list); > + INIT_LIST_HEAD(&device->dev_alloc_list); > + > + list_add(&device->dev_list, &fs_devices->devices); > + device->fs_devices = fs_devices; > + fs_devices->num_devices++; > + } > + return fs_devices; > +error: > + free_fs_devices(fs_devices); > + return ERR_PTR(-ENOMEM); > +} > + > +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) What does this function do? An explanatory comment would help. > +{ > + struct list_head *tmp; > + struct list_head *cur; > + struct btrfs_device *device; > + > + mutex_lock(&uuid_mutex); btrfs_lock_volumes()? 
> +again: > + list_for_each_safe(cur, tmp, &fs_devices->devices) { > + device = list_entry(cur, struct btrfs_device, dev_list); > + if (device->in_fs_metadata) > + continue; > + > + if (device->bdev) { > + close_bdev_exclusive(device->bdev, device->mode); > + device->bdev = NULL; > + fs_devices->open_devices--; > + } > + if (device->writeable) { > + list_del_init(&device->dev_alloc_list); > + device->writeable = 0; > + fs_devices->rw_devices--; > + } Why are we bothering to modify *device? It's about to be freed. > + list_del_init(&device->dev_list); > + fs_devices->num_devices--; > + kfree(device->name); > + kfree(device); > + } > + > + if (fs_devices->seed) { > + fs_devices = fs_devices->seed; > + goto again; > + } > + > + mutex_unlock(&uuid_mutex); ... > + return 0; > +} > + > +static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) > +{ > + struct list_head *cur; > + struct btrfs_device *device; > + > + if (--fs_devices->opened > 0) > + return 0; > + > + list_for_each(cur, &fs_devices->devices) { > + device = list_entry(cur, struct btrfs_device, dev_list); > + if (device->bdev) { This reader is wondering what btrfs_device.bdev==NULL means. How would one go about determining this? > + close_bdev_exclusive(device->bdev, device->mode); > + fs_devices->open_devices--; > + } > + if (device->writeable) { > + list_del_init(&device->dev_alloc_list); > + fs_devices->rw_devices--; > + } > + > + device->bdev = NULL; > + device->writeable = 0; > + device->in_fs_metadata = 0; What does in_fs_metadata mean? Perhaps those three fields should have type `bool'. > + } > + WARN_ON(fs_devices->open_devices); > + WARN_ON(fs_devices->rw_devices); > + fs_devices->opened = 0; > + fs_devices->seeding = 0; What does the `seeding' field mean? It's a boolean, I assume.. 
> + return 0; > +} > + > +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) > +{ > + struct btrfs_fs_devices *seed_devices = NULL; > + int ret; > + > + mutex_lock(&uuid_mutex); btrfs_lock_volumes()? (Please check whole fs for consistency here) Seems strange that a lock called "uuid_mutex" is protecting a pile of things which don't seem very uuid-related. > + ret = __btrfs_close_devices(fs_devices); > + if (!fs_devices->opened) { > + seed_devices = fs_devices->seed; > + fs_devices->seed = NULL; > + } > + mutex_unlock(&uuid_mutex); > + > + while (seed_devices) { > + fs_devices = seed_devices; > + seed_devices = fs_devices->seed; > + __btrfs_close_devices(fs_devices); > + free_fs_devices(fs_devices); > + } > + return ret; > +} > + > +static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, > + fmode_t flags, void *holder) > +{ > + struct block_device *bdev; > + struct list_head *head = &fs_devices->devices; > + struct list_head *cur; > + struct btrfs_device *device; > + struct block_device *latest_bdev = NULL; > + struct buffer_head *bh; > + struct btrfs_super_block *disk_super; > + u64 latest_devid = 0; > + u64 latest_transid = 0; > + u64 devid; > + int seeding = 1; > + int ret = 0; > + > + list_for_each(cur, head) { > + device = list_entry(cur, struct btrfs_device, dev_list); list_for_each_entry() > + if (device->bdev) > + continue; > + if (!device->name) > + continue; What's the algorithm for understanding the above four lines? > + bdev = open_bdev_exclusive(device->name, flags, holder); > + if (IS_ERR(bdev)) { > + printk(KERN_INFO "open %s failed\n", device->name); > + goto error; > + } > + set_blocksize(bdev, 4096); hm. What's the situation with btrfs blocksizes? 
> + bh = btrfs_read_dev_super(bdev); > + if (!bh) > + goto error_close; > + > + disk_super = (struct btrfs_super_block *)bh->b_data; > + devid = le64_to_cpu(disk_super->dev_item.devid); > + if (devid != device->devid) > + goto error_brelse; > + > + if (memcmp(device->uuid, disk_super->dev_item.uuid, > + BTRFS_UUID_SIZE)) > + goto error_brelse; > + > + device->generation = btrfs_super_generation(disk_super); > + if (!latest_transid || device->generation > latest_transid) { > + latest_devid = devid; > + latest_transid = device->generation; > + latest_bdev = bdev; > + } > + > + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { > + device->writeable = 0; > + } else { > + device->writeable = !bdev_read_only(bdev); > + seeding = 0; > + } > + > + device->bdev = bdev; > + device->in_fs_metadata = 0; > + device->mode = flags; > + > + fs_devices->open_devices++; > + if (device->writeable) { > + fs_devices->rw_devices++; > + list_add(&device->dev_alloc_list, > + &fs_devices->alloc_list); > + } > + continue; > + > +error_brelse: > + brelse(bh); brelse() is only needed in situations where the pointer might be NULL. 
> +error_close: > + close_bdev_exclusive(bdev, FMODE_READ); > +error: > + continue; > + } > + if (fs_devices->open_devices == 0) { > + ret = -EIO; > + goto out; > + } > + fs_devices->seeding = seeding; > + fs_devices->opened = 1; > + fs_devices->latest_bdev = latest_bdev; > + fs_devices->latest_devid = latest_devid; > + fs_devices->latest_trans = latest_transid; > + fs_devices->total_rw_bytes = 0; > +out: > + return ret; > +} > + > +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, > + fmode_t flags, void *holder) > +{ > + int ret; > + > + mutex_lock(&uuid_mutex); > + if (fs_devices->opened) { > + fs_devices->opened++; > + ret = 0; > + } else { > + ret = __btrfs_open_devices(fs_devices, flags, holder); > + } > + mutex_unlock(&uuid_mutex); > + return ret; > +} > + > +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, > + struct btrfs_fs_devices **fs_devices_ret) > +{ > + struct btrfs_super_block *disk_super; > + struct block_device *bdev; > + struct buffer_head *bh; > + int ret; > + u64 devid; > + u64 transid; > + > + mutex_lock(&uuid_mutex); > + > + bdev = open_bdev_exclusive(path, flags, holder); > + > + if (IS_ERR(bdev)) { > + ret = PTR_ERR(bdev); > + goto error; > + } > + > + ret = set_blocksize(bdev, 4096); > + if (ret) > + goto error_close; > + bh = btrfs_read_dev_super(bdev); > + if (!bh) { > + ret = -EIO; > + goto error_close; > + } > + disk_super = (struct btrfs_super_block *)bh->b_data; <looks at struct btrfs_super_block> It needs work, I think. This: __le64 compat_ro_flags; __le64 incompat_flags; __le16 csum_type; u8 root_level; u8 chunk_root_level; u8 log_root_level; struct btrfs_dev_item dev_item; doesn't look like it will successfully port between architectures. Also `struct btrfs_dev_item' has three le32's wedged between le64's. I suppose that the __packed will help in some cases (but not the btrfs_super_block, surely). However it seems unwise/incautious/ugly to rely upon __packed in this manner. 
Please do s/__attribute__ ((__packed__))/__packed/g > + devid = le64_to_cpu(disk_super->dev_item.devid); > + transid = btrfs_super_generation(disk_super); > + if (disk_super->label[0]) > + printk(KERN_INFO "device label %s ", disk_super->label); > + else { > + /* FIXME, make a readl uuid parser */ what's that? > + printk(KERN_INFO "device fsid %llx-%llx ", > + *(unsigned long long *)disk_super->fsid, > + *(unsigned long long *)(disk_super->fsid + 8)); > + } > + printk(KERN_INFO "devid %llu transid %llu %s\n", > + (unsigned long long)devid, (unsigned long long)transid, path); > + ret = device_list_add(path, disk_super, devid, fs_devices_ret); > + > + brelse(bh); bh is non-NULL here > +error_close: > + close_bdev_exclusive(bdev, flags); > +error: > + mutex_unlock(&uuid_mutex); > + return ret; > +} > + > +/* > + * this uses a pretty simple search, the expectation is that it is > + * called very infrequently and that a given device has a small number > + * of extents > + */ > +static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, > + struct btrfs_device *device, > + u64 num_bytes, u64 *start) > +{ > + struct btrfs_key key; > + struct btrfs_root *root = device->dev_root; > + struct btrfs_dev_extent *dev_extent = NULL; > + struct btrfs_path *path; > + u64 hole_size = 0; > + u64 last_byte = 0; > + u64 search_start = 0; > + u64 search_end = device->total_bytes; > + int ret; > + int slot = 0; > + int start_found; > + struct extent_buffer *l; > + > + path = btrfs_alloc_path(); > + if (!path) > + return -ENOMEM; > + path->reada = 2; > + start_found = 0; > + > + /* FIXME use last free of some kind */ > + > + /* we don't want to overwrite the superblock on the drive, > + * so we make sure to start at an offset of at least 1MB > + */ > + search_start = max((u64)1024 * 1024, search_start); max_t > + if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) > + search_start = max(root->fs_info->alloc_start, search_start); > + > + key.objectid = 
device->devid; > + key.offset = search_start; > + key.type = BTRFS_DEV_EXTENT_KEY; > + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); > + if (ret < 0) > + goto error; > + ret = btrfs_previous_item(root, path, 0, key.type); > + if (ret < 0) > + goto error; > + l = path->nodes[0]; > + btrfs_item_key_to_cpu(l, &key, path->slots[0]); > + while (1) { > + l = path->nodes[0]; > + slot = path->slots[0]; > + if (slot >= btrfs_header_nritems(l)) { > + ret = btrfs_next_leaf(root, path); > + if (ret == 0) > + continue; > + if (ret < 0) > + goto error; > +no_more_items: > + if (!start_found) { > + if (search_start >= search_end) { > + ret = -ENOSPC; Hey. Shouldn't that go BUG()? > + goto error; > + } > + *start = search_start; > + start_found = 1; > + goto check_pending; > + } > + *start = last_byte > search_start ? > + last_byte : search_start; max()? > + if (search_end <= *start) { > + ret = -ENOSPC; > + goto error; > + } > + goto check_pending; > + } > + btrfs_item_key_to_cpu(l, &key, slot); > + > + if (key.objectid < device->devid) > + goto next; > + > + if (key.objectid > device->devid) > + goto no_more_items; > + > + if (key.offset >= search_start && key.offset > last_byte && > + start_found) { > + if (last_byte < search_start) > + last_byte = search_start; > + hole_size = key.offset - last_byte; > + if (key.offset > last_byte && > + hole_size >= num_bytes) { > + *start = last_byte; > + goto check_pending; > + } > + } > + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) > + goto next; > + > + start_found = 1; > + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); > + last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); > +next: > + path->slots[0]++; > + cond_resched(); > + } > +check_pending: > + /* we have to make sure we didn't find an extent that has already > + * been allocated by the map tree or the original allocation > + */ > + BUG_ON(*start < search_start); Is it triggerable by on-disk corruption? 
> + if (*start + num_bytes > search_end) { > + ret = -ENOSPC; > + goto error; > + } > + /* check for pending inserts here */ > + ret = 0; > + > +error: > + btrfs_free_path(path); > + return ret; > +} > + > +static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, > + struct btrfs_device *device, > + u64 start) > +{ > + int ret; > + struct btrfs_path *path; > + struct btrfs_root *root = device->dev_root; > + struct btrfs_key key; > + struct btrfs_key found_key; > + struct extent_buffer *leaf = NULL; > + struct btrfs_dev_extent *extent = NULL; > + > + path = btrfs_alloc_path(); > + if (!path) > + return -ENOMEM; > + > + key.objectid = device->devid; > + key.offset = start; > + key.type = BTRFS_DEV_EXTENT_KEY; > + > + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); <looks at btrfs_search_slot> * if ins_len > 0, nodes and leaves will be split as we walk down the * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if * possible) The (ab)use of the sign of formal arg `ins_len' seems a bit nasty? > + if (ret > 0) { > + ret = btrfs_previous_item(root, path, key.objectid, > + BTRFS_DEV_EXTENT_KEY); > + BUG_ON(ret); > + leaf = path->nodes[0]; > + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); > + extent = btrfs_item_ptr(leaf, path->slots[0], > + struct btrfs_dev_extent); > + BUG_ON(found_key.offset > start || found_key.offset + > + btrfs_dev_extent_length(leaf, extent) < start); > + ret = 0; > + } else if (ret == 0) { > + leaf = path->nodes[0]; > + extent = btrfs_item_ptr(leaf, path->slots[0], > + struct btrfs_dev_extent); > + } > + BUG_ON(ret); > + > + if (device->bytes_used > 0) > + device->bytes_used -= btrfs_dev_extent_length(leaf, extent); > + ret = btrfs_del_item(trans, root, path); > + BUG_ON(ret); > + > + btrfs_free_path(path); > + return ret; > +} > + > > ... 
> > +static noinline int find_next_chunk(struct btrfs_root *root, > + u64 objectid, u64 *offset) > +{ > + struct btrfs_path *path; > + int ret; > + struct btrfs_key key; > + struct btrfs_chunk *chunk; > + struct btrfs_key found_key; > + > + path = btrfs_alloc_path(); > + BUG_ON(!path); Stuff like this will eventually have to be weeded out and fixed. It at least needs a big FIXME the instant it is typed in. Quietly slipping it in there in this manner is just creating more work for yourselves. > + key.objectid = objectid; > + key.offset = (u64)-1; > + key.type = BTRFS_CHUNK_ITEM_KEY; > + > + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); > + if (ret < 0) > + goto error; > + > + BUG_ON(ret == 0); Is this triggerable by a corrupted fs? > + ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); > + if (ret) { > + *offset = 0; > + } else { > + btrfs_item_key_to_cpu(path->nodes[0], &found_key, > + path->slots[0]); > + if (found_key.objectid != objectid) > + *offset = 0; > + else { > + chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], > + struct btrfs_chunk); > + *offset = found_key.offset + > + btrfs_chunk_length(path->nodes[0], chunk); > + } > + } > + ret = 0; > +error: > + btrfs_free_path(path); > + return ret; > +} > + > > ... 
> > + > +/* > + * the device information is stored in the chunk root > + * the btrfs_device struct should be fully filled in > + */ > +int btrfs_add_device(struct btrfs_trans_handle *trans, > + struct btrfs_root *root, > + struct btrfs_device *device) > +{ > + int ret; > + struct btrfs_path *path; > + struct btrfs_dev_item *dev_item; > + struct extent_buffer *leaf; > + struct btrfs_key key; > + unsigned long ptr; > + > + root = root->fs_info->chunk_root; > + > + path = btrfs_alloc_path(); > + if (!path) > + return -ENOMEM; > + > + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; > + key.type = BTRFS_DEV_ITEM_KEY; > + key.offset = device->devid; > + > + ret = btrfs_insert_empty_item(trans, root, path, &key, > + sizeof(*dev_item)); > + if (ret) > + goto out; > + > + leaf = path->nodes[0]; > + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); > + > + btrfs_set_device_id(leaf, dev_item, device->devid); > + btrfs_set_device_generation(leaf, dev_item, 0); > + btrfs_set_device_type(leaf, dev_item, device->type); > + btrfs_set_device_io_align(leaf, dev_item, device->io_align); > + btrfs_set_device_io_width(leaf, dev_item, device->io_width); > + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); > + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); > + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); > + btrfs_set_device_group(leaf, dev_item, 0); > + btrfs_set_device_seek_speed(leaf, dev_item, 0); > + btrfs_set_device_bandwidth(leaf, dev_item, 0); > + btrfs_set_device_start_offset(leaf, dev_item, 0); > + > + ptr = (unsigned long)btrfs_device_uuid(dev_item); > + write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); > + ptr = (unsigned long)btrfs_device_fsid(dev_item); > + write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); > + btrfs_mark_buffer_dirty(leaf); I expected a function called btrfs_mark_buffer_dirty() to be a wrapper around mark_buffer_dirty(). 
> + ret = 0; > +out: > + btrfs_free_path(path); > + return ret; > +} > + > > ... > > + > +int btrfs_rm_device(struct btrfs_root *root, char *device_path) > +{ > + struct btrfs_device *device; > + struct btrfs_device *next_device; > + struct block_device *bdev; > + struct buffer_head *bh = NULL; > + struct btrfs_super_block *disk_super; > + u64 all_avail; > + u64 devid; > + u64 num_devices; > + u8 *dev_uuid; > + int ret = 0; > + > + mutex_lock(&uuid_mutex); btrfs_lock_volumes() > + mutex_lock(&root->fs_info->volume_mutex); lock_chunks() (which needs renaming) > + all_avail = root->fs_info->avail_data_alloc_bits | > + root->fs_info->avail_system_alloc_bits | > + root->fs_info->avail_metadata_alloc_bits; > + > + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && > + root->fs_info->fs_devices->rw_devices <= 4) { > + printk(KERN_ERR "btrfs: unable to go below four devices " > + "on raid10\n"); > + ret = -EINVAL; > + goto out; > + } > + > + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && > + root->fs_info->fs_devices->rw_devices <= 2) { > + printk(KERN_ERR "btrfs: unable to go below two " > + "devices on raid1\n"); > + ret = -EINVAL; > + goto out; > + } > + > + if (strcmp(device_path, "missing") == 0) { <wonders what interface all this is implementing> > + struct list_head *cur; > + struct list_head *devices; > + struct btrfs_device *tmp; > + > + device = NULL; > + devices = &root->fs_info->fs_devices->devices; > + list_for_each(cur, devices) { > + tmp = list_entry(cur, struct btrfs_device, dev_list); > + if (tmp->in_fs_metadata && !tmp->bdev) { > + device = tmp; > + break; > + } > + } > + bdev = NULL; > + bh = NULL; > + disk_super = NULL; > + if (!device) { > + printk(KERN_ERR "btrfs: no missing devices found to " > + "remove\n"); > + goto out; > + } > + } else { > + bdev = open_bdev_exclusive(device_path, FMODE_READ, > + root->fs_info->bdev_holder); > + if (IS_ERR(bdev)) { > + ret = PTR_ERR(bdev); > + goto out; > + } > + > + set_blocksize(bdev, 4096); > + bh = 
btrfs_read_dev_super(bdev); > + if (!bh) { > + ret = -EIO; > + goto error_close; > + } > + disk_super = (struct btrfs_super_block *)bh->b_data; > + devid = le64_to_cpu(disk_super->dev_item.devid); > + dev_uuid = disk_super->dev_item.uuid; > + device = btrfs_find_device(root, devid, dev_uuid, > + disk_super->fsid); > + if (!device) { > + ret = -ENOENT; > + goto error_brelse; > + } > + } > + > + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { > + printk(KERN_ERR "btrfs: unable to remove the only writeable " > + "device\n"); > + ret = -EINVAL; > + goto error_brelse; > + } > + > + if (device->writeable) { > + list_del_init(&device->dev_alloc_list); > + root->fs_info->fs_devices->rw_devices--; > + } > + > + ret = btrfs_shrink_device(device, 0); > + if (ret) > + goto error_brelse; > + > + ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); > + if (ret) > + goto error_brelse; > + > + device->in_fs_metadata = 0; > + list_del_init(&device->dev_list); > + device->fs_devices->num_devices--; > + > + next_device = list_entry(root->fs_info->fs_devices->devices.next, > + struct btrfs_device, dev_list); > + if (device->bdev == root->fs_info->sb->s_bdev) > + root->fs_info->sb->s_bdev = next_device->bdev; > + if (device->bdev == root->fs_info->fs_devices->latest_bdev) > + root->fs_info->fs_devices->latest_bdev = next_device->bdev; > + > + if (device->bdev) { > + close_bdev_exclusive(device->bdev, device->mode); > + device->bdev = NULL; > + device->fs_devices->open_devices--; > + } > + > + num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; > + btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); > + > + if (device->fs_devices->open_devices == 0) { > + struct btrfs_fs_devices *fs_devices; > + fs_devices = root->fs_info->fs_devices; > + while (fs_devices) { > + if (fs_devices->seed == device->fs_devices) > + break; > + fs_devices = fs_devices->seed; > + } > + fs_devices->seed = device->fs_devices->seed; > + 
device->fs_devices->seed = NULL; > + __btrfs_close_devices(device->fs_devices); > + free_fs_devices(device->fs_devices); > + } > + > + /* > + * at this point, the device is zero sized. We want to > + * remove it from the devices list and zero out the old super > + */ > + if (device->writeable) { > + /* make sure this device isn't detected as part of > + * the FS anymore > + */ > + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); > + set_buffer_dirty(bh); > + sync_dirty_buffer(bh); > + } > + > + kfree(device->name); > + kfree(device); > + ret = 0; > + > +error_brelse: > + brelse(bh); > +error_close: > + if (bdev) > + close_bdev_exclusive(bdev, FMODE_READ); > +out: > + mutex_unlock(&root->fs_info->volume_mutex); > + mutex_unlock(&uuid_mutex); > + return ret; > +} > + > +/* > + * does all the dirty work required for changing file system's UUID. > + */ > +static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, > + struct btrfs_root *root) > +{ > + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; > + struct btrfs_fs_devices *old_devices; > + struct btrfs_fs_devices *seed_devices; > + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; > + struct btrfs_device *device; > + u64 super_flags; > + > + BUG_ON(!mutex_is_locked(&uuid_mutex)); Given the (regrettable?) existence of btrfs_lock_volumes() and btrfs_unlock_volumes(), it would make sense to add another regrettable wrapper for the above operation. 
> + if (!fs_devices->seeding) > + return -EINVAL; <Again wonders what "seeding" means> > + seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); > + if (!seed_devices) > + return -ENOMEM; > + > + old_devices = clone_fs_devices(fs_devices); > + if (IS_ERR(old_devices)) { > + kfree(seed_devices); > + return PTR_ERR(old_devices); > + } > + > + list_add(&old_devices->list, &fs_uuids); > + > + memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); > + seed_devices->opened = 1; > + INIT_LIST_HEAD(&seed_devices->devices); > + INIT_LIST_HEAD(&seed_devices->alloc_list); > + list_splice_init(&fs_devices->devices, &seed_devices->devices); > + list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); > + list_for_each_entry(device, &seed_devices->devices, dev_list) { > + device->fs_devices = seed_devices; > + } unneeded braces <runs checkpatch> <checkpatch misses this - regression?> #2001: FILE: fs/btrfs/volumes.c:1930: + return calc_size * num_stripes; ^ ERROR: space prohibited after that '*' (ctx:WxW) #2072: FILE: fs/btrfs/volumes.c:2001: + max_chunk_size = calc_size * 2; ^ ERROR: space prohibited after that '*' (ctx:WxW) #2089: FILE: fs/btrfs/volumes.c:2018: + if (calc_size * num_stripes > max_chunk_size) { ^ ERROR: space prohibited after that '*' (ctx:WxW) #2105: FILE: fs/btrfs/volumes.c:2034: + min_free = calc_size * 2; ^ hm, those checkpatch errors are all false positives — these are multiplications, not pointer dereferences, so the spacing in the patch is correct. > + fs_devices->seeding = 0; > + fs_devices->num_devices = 0; > + fs_devices->open_devices = 0; > + fs_devices->seed = seed_devices; > + > + generate_random_uuid(fs_devices->fsid); > + memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); > + memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); > + super_flags = btrfs_super_flags(disk_super) & > + ~BTRFS_SUPER_FLAG_SEEDING; > + btrfs_set_super_flags(disk_super, super_flags); > + > + return 0; > +} > + > > ... 
> > + > +int btrfs_init_new_device(struct btrfs_root *root, char *device_path) > +{ > + struct btrfs_trans_handle *trans; > + struct btrfs_device *device; > + struct block_device *bdev; > + struct list_head *cur; > + struct list_head *devices; > + struct super_block *sb = root->fs_info->sb; > + u64 total_bytes; > + int seeding_dev = 0; > + int ret = 0; > + > + if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) > + return -EINVAL; > + > + bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); > + if (!bdev) > + return -EIO; > + > + if (root->fs_info->fs_devices->seeding) { > + seeding_dev = 1; > + down_write(&sb->s_umount); > + mutex_lock(&uuid_mutex); > + } > + > + filemap_write_and_wait(bdev->bd_inode->i_mapping); I'm surprised that this can be safely done under ->s_umount. > + mutex_lock(&root->fs_info->volume_mutex); > + > + devices = &root->fs_info->fs_devices->devices; > + list_for_each(cur, devices) { > + device = list_entry(cur, struct btrfs_device, dev_list); > + if (device->bdev == bdev) { > + ret = -EEXIST; > + goto error; > + } > + } > + > + device = kzalloc(sizeof(*device), GFP_NOFS); > + if (!device) { > + /* we can safely leave the fs_devices entry around */ > + ret = -ENOMEM; > + goto error; > + } > + > + device->name = kstrdup(device_path, GFP_NOFS); > + if (!device->name) { > + kfree(device); > + ret = -ENOMEM; > + goto error; > + } > + > + ret = find_next_devid(root, &device->devid); > + if (ret) { > + kfree(device); Leaks device->name > + goto error; > + } > + > + trans = btrfs_start_transaction(root, 1); > + lock_chunks(root); > + > + device->barriers = 1; > + device->writeable = 1; > + device->work.func = pending_bios_fn; Should we be using INIT_WORK() here? It has lockdep implications. (Multiple instances). 
> + generate_random_uuid(device->uuid); > + spin_lock_init(&device->io_lock); > + device->generation = trans->transid; > + device->io_width = root->sectorsize; > + device->io_align = root->sectorsize; > + device->sector_size = root->sectorsize; > + device->total_bytes = i_size_read(bdev->bd_inode); > + device->dev_root = root->fs_info->dev_root; > + device->bdev = bdev; > + device->in_fs_metadata = 1; > + device->mode = 0; > + set_blocksize(device->bdev, 4096); > + > + if (seeding_dev) { > + sb->s_flags &= ~MS_RDONLY; > + ret = btrfs_prepare_sprout(trans, root); > + BUG_ON(ret); > + } > + > + device->fs_devices = root->fs_info->fs_devices; > + list_add(&device->dev_list, &root->fs_info->fs_devices->devices); > + list_add(&device->dev_alloc_list, > + &root->fs_info->fs_devices->alloc_list); > + root->fs_info->fs_devices->num_devices++; > + root->fs_info->fs_devices->open_devices++; > + root->fs_info->fs_devices->rw_devices++; > + root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; > + > + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); > + btrfs_set_super_total_bytes(&root->fs_info->super_copy, > + total_bytes + device->total_bytes); > + > + total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); > + btrfs_set_super_num_devices(&root->fs_info->super_copy, > + total_bytes + 1); > + > + if (seeding_dev) { > + ret = init_first_rw_device(trans, root, device); > + BUG_ON(ret); > + ret = btrfs_finish_sprout(trans, root); > + BUG_ON(ret); > + } else { > + ret = btrfs_add_device(trans, root, device); > + } > + > + unlock_chunks(root); > + btrfs_commit_transaction(trans, root); > + > + if (seeding_dev) { > + mutex_unlock(&uuid_mutex); > + up_write(&sb->s_umount); > + > + ret = btrfs_relocate_sys_chunks(root); > + BUG_ON(ret); > + } > +out: > + mutex_unlock(&root->fs_info->volume_mutex); > + return ret; > +error: > + close_bdev_exclusive(bdev, 0); > + if (seeding_dev) { > + mutex_unlock(&uuid_mutex); > + up_write(&sb->s_umount); > + 
} > + goto out; > +} > + <attention span exceeded, sorry> -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html