Re: [RFC PATCH] bcache: enable zoned device support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 2019/12/06 9:22, Eric Wheeler wrote:
> On Thu, 5 Dec 2019, Coly Li wrote:
>> This is a very basic zoned device support. With this patch, bcache
>> device is able to,
>> - Export zoned device attribution via sysfs
>> - Response report zones request, e.g. by command 'blkzone report'
>> But the bcache device is still NOT able to,
>> - Response any zoned device management request or IOCTL command
>>
>> Here are the testings I have done,
>> - read /sys/block/bcache0/queue/zoned, content is 'host-managed'
>> - read /sys/block/bcache0/queue/nr_zones, content is number of zones
>>   including all zone types.
>> - read /sys/block/bcache0/queue/chunk_sectors, content is zone size
>>   in sectors.
>> - run 'blkzone report /dev/bcache0', all zones information displayed.
>> - run 'blkzone reset /dev/bcache0', operation is rejected with error
>>   information: "blkzone: /dev/bcache0: BLKRESETZONE ioctl failed:
>>   Operation not supported"
>> - Sequential writes by dd, I can see some zones' write pointer 'wptr'
>>   values updated.
>>
>> All of these are very basic testings, if you have better testing
>> tools or cases, please offer me hint.
> 
> Interesting. 
> 
> 1. should_writeback() could benefit by hinting true when an IO would fall 
>    in a zoned region.
> 
> 2. The writeback thread could writeback such that they prefer 
>    fully(mostly)-populated zones when choosing what to write out.

That definitely would be a good idea since that would certainly benefit
backend-GC (that will be needed).

However, I do not see the point in exposing the /dev/bcacheX block
device itself as a zoned disk. In fact, I think we want exactly the
opposite: expose it as a regular disk so that any FS or application can
run. If the bcache backend disk is zoned, then the writeback handles
sequential writes. This would be in the end a solution similar to
dm-zoned, that is, a zoned disk becomes useable as a regular block
device (random writes anywhere are possible), but likely far more
efficient and faster. That may result in imposing some limitations on
bcache operations though, e.g. it can only be setup with writeback, no
writethrough allowed (not sure though...).
Thoughts ?

> 
> --
> Eric Wheeler
> 
> 
> 
>> Thanks in advance for your review and comments.
>>
>> Signed-off-by: Coly Li <colyli@xxxxxxx>
>> CC: Damien Le Moal <damien.lemoal@xxxxxxx>
>> CC: Hannes Reinecke <hare@xxxxxxxx>
>> ---
>>  drivers/md/bcache/bcache.h  | 10 ++++++
>>  drivers/md/bcache/request.c | 74 +++++++++++++++++++++++++++++++++++++++++++++
>>  drivers/md/bcache/super.c   | 41 +++++++++++++++++++++++--
>>  3 files changed, 122 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
>> index 9198c1b480d9..77c2040c99ee 100644
>> --- a/drivers/md/bcache/bcache.h
>> +++ b/drivers/md/bcache/bcache.h
>> @@ -221,6 +221,7 @@ BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
>>  struct search;
>>  struct btree;
>>  struct keybuf;
>> +struct bch_report_zones_args;
>>  
>>  struct keybuf_key {
>>  	struct rb_node		node;
>> @@ -277,6 +278,8 @@ struct bcache_device {
>>  			  struct bio *bio, unsigned int sectors);
>>  	int (*ioctl)(struct bcache_device *d, fmode_t mode,
>>  		     unsigned int cmd, unsigned long arg);
>> +	int (*report_zones)(struct bch_report_zones_args *args,
>> +			    unsigned int nr_zones);
>>  };
>>  
>>  struct io {
>> @@ -743,6 +746,13 @@ struct bbio {
>>  	struct bio		bio;
>>  };
>>  
>> +struct bch_report_zones_args {
>> +	struct bcache_device *bcache_device;
>> +	sector_t sector;
>> +	void *orig_data;
>> +	report_zones_cb orig_cb;
>> +};
>> +
>>  #define BTREE_PRIO		USHRT_MAX
>>  #define INITIAL_PRIO		32768U
>>  
>> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
>> index 7555e4a93145..d82425300383 100644
>> --- a/drivers/md/bcache/request.c
>> +++ b/drivers/md/bcache/request.c
>> @@ -1162,6 +1162,19 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
>>  		}
>>  	}
>>  
>> +	/*
>> +	 * zone management request may change the data layout and content
>> +	 * implicitly on backing device, which introduces unacceptible
>> +	 * inconsistency between the backing device and the cache device.
>> +	 * Therefore all zone management related request will be denied here.
>> +	 */
>> +	if (op_is_zone_mgmt(bio->bi_opf)) {
>> +		pr_err_ratelimited("zoned device management request denied.");
>> +		bio->bi_status = BLK_STS_NOTSUPP;
>> +		bio_endio(bio);
>> +		return BLK_QC_T_NONE;
>> +	}
>> +
>>  	generic_start_io_acct(q,
>>  			      bio_op(bio),
>>  			      bio_sectors(bio),
>> @@ -1205,6 +1218,24 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
>>  	if (dc->io_disable)
>>  		return -EIO;
>>  
>> +	/*
>> +	 * zone management ioctl commands may change the data layout
>> +	 * and content implicitly on backing device, which introduces
>> +	 * unacceptible inconsistency between the backing device and
>> +	 * the cache device. Therefore all zone management related
>> +	 * ioctl commands will be denied here.
>> +	 */
>> +	switch (cmd) {
>> +	case BLKRESETZONE:
>> +	case BLKOPENZONE:
>> +	case BLKCLOSEZONE:
>> +	case BLKFINISHZONE:
>> +		return -EOPNOTSUPP;
>> +	default:
>> +		/* Other commands fall through*/
>> +		break;
>> +	}
>> +
>>  	return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
>>  }
>>  
>> @@ -1233,6 +1264,48 @@ static int cached_dev_congested(void *data, int bits)
>>  	return ret;
>>  }
>>  
>> +static int cached_dev_report_zones_cb(struct blk_zone *zone,
>> +				      unsigned int idx,
>> +				      void *data)
>> +{
>> +	struct bch_report_zones_args *args = data;
>> +	struct bcache_device *d = args->bcache_device;
>> +	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
>> +	unsigned long data_offset = dc->sb.data_offset;
>> +
>> +	if (zone->start >= data_offset) {
>> +		zone->start -= data_offset;
>> +		zone->wp -= data_offset;
>> +	} else {
>> +		/*
>> +		 * Normally the first zone is conventional zone,
>> +		 * we don't need to worry about how to maintain
>> +		 * the write pointer.
>> +		 * But the zone->len is special, because the
>> +		 * sector 0 on bcache device is 8KB offset on
>> +		 * backing device indeed.
>> +		 */
>> +		zone->len -= data_offset;
>> +	}
>> +
>> +	return args->orig_cb(zone, idx, args->orig_data);
>> +}
>> +
>> +static int cached_dev_report_zones(struct bch_report_zones_args *args,
>> +				   unsigned int nr_zones)
>> +{
>> +	struct bcache_device *d = args->bcache_device;
>> +	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
>> +	sector_t sector = args->sector + dc->sb.data_offset;
>> +
>> +	/* sector is real LBA of backing device */
>> +	return blkdev_report_zones(dc->bdev,
>> +				   sector,
>> +				   nr_zones,
>> +				   cached_dev_report_zones_cb,
>> +				   args);
>> +}
>> +
>>  void bch_cached_dev_request_init(struct cached_dev *dc)
>>  {
>>  	struct gendisk *g = dc->disk.disk;
>> @@ -1241,6 +1314,7 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
>>  	g->queue->backing_dev_info->congested_fn = cached_dev_congested;
>>  	dc->disk.cache_miss			= cached_dev_cache_miss;
>>  	dc->disk.ioctl				= cached_dev_ioctl;
>> +	dc->disk.report_zones			= cached_dev_report_zones;
>>  }
>>  
>>  /* Flash backed devices */
>> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
>> index 77e9869345e7..7222fcafaf50 100644
>> --- a/drivers/md/bcache/super.c
>> +++ b/drivers/md/bcache/super.c
>> @@ -672,10 +672,32 @@ static int ioctl_dev(struct block_device *b, fmode_t mode,
>>  	return d->ioctl(d, mode, cmd, arg);
>>  }
>>  
>> +
>> +static int report_zones_dev(struct gendisk *disk,
>> +			    sector_t sector,
>> +			    unsigned int nr_zones,
>> +			    report_zones_cb cb,
>> +			    void *data)
>> +{
>> +	struct bcache_device *d = disk->private_data;
>> +	struct bch_report_zones_args args = {
>> +		.bcache_device = d,
>> +		.sector = sector,
>> +		.orig_data = data,
>> +		.orig_cb = cb,
>> +	};
>> +
>> +	if (d->report_zones)
>> +		return d->report_zones(&args, nr_zones);
>> +
>> +	return -EOPNOTSUPP;
>> +}
>> +
>>  static const struct block_device_operations bcache_ops = {
>>  	.open		= open_dev,
>>  	.release	= release_dev,
>>  	.ioctl		= ioctl_dev,
>> +	.report_zones	= report_zones_dev,
>>  	.owner		= THIS_MODULE,
>>  };
>>  
>> @@ -808,7 +830,9 @@ static void bcache_device_free(struct bcache_device *d)
>>  	closure_debug_destroy(&d->cl);
>>  }
>>  
>> -static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
>> +static int bcache_device_init(struct cached_dev *dc,
>> +			      struct bcache_device *d,
>> +			      unsigned int block_size,
>>  			      sector_t sectors)
>>  {
>>  	struct request_queue *q;
>> @@ -882,6 +906,17 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
>>  
>>  	blk_queue_write_cache(q, true, true);
>>  
>> +	/*
>> +	 * If this is for backing device registration, and it is an
>> +	 * zoned device (e.g. host-managed S.M.R. hard drive), set
>> +	 * up zoned device attribution properly for sysfs interface.
>> +	 */
>> +	if (dc && bdev_is_zoned(dc->bdev)) {
>> +		q->limits.zoned = bdev_zoned_model(dc->bdev);
>> +		q->nr_zones = blkdev_nr_zones(dc->bdev);
>> +		q->limits.chunk_sectors = bdev_zone_sectors(dc->bdev);
>> +	}
>> +
>>  	return 0;
>>  
>>  err:
>> @@ -1328,7 +1363,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
>>  		dc->partial_stripes_expensive =
>>  			q->limits.raid_partial_stripes_expensive;
>>  
>> -	ret = bcache_device_init(&dc->disk, block_size,
>> +	ret = bcache_device_init(dc, &dc->disk, block_size,
>>  			 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
>>  	if (ret)
>>  		return ret;
>> @@ -1445,7 +1480,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
>>  
>>  	kobject_init(&d->kobj, &bch_flash_dev_ktype);
>>  
>> -	if (bcache_device_init(d, block_bytes(c), u->sectors))
>> +	if (bcache_device_init(NULL, d, block_bytes(c), u->sectors))
>>  		goto err;
>>  
>>  	bcache_device_attach(d, c, u - c->uuids);
>> -- 
>> 2.16.4
>>
>>
> 


-- 
Damien Le Moal
Western Digital Research




[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux