Re: [PATCH 2 of 8] block: Export I/O topology for block devices and partitions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Apr 23 2009, Martin K. Petersen wrote:
> To support devices with physical block sizes bigger than 512 bytes we
> need to ensure proper alignment.  This patch adds support for exposing
> I/O topology characteristics as devices are stacked.
> 
>   hardsect_size remains unchanged.  It is the smallest atomic unit the
>   device can address (i.e. logical block size).
> 
>   granularity indicates the smallest I/O the device can access without
>   incurring a read-modify-write penalty.  The granularity is set by
>   low-level drivers; from then on it is purely internal to the stacking
>   logic.
> 
>   The min_io parameter is the smallest preferred I/O size reported by
>   the device.  In many cases this is the same as granularity.  However,
>   the min_io parameter can be scaled up when stacking (RAID5 chunk
>   size > physical sector size).  min_io is available in sysfs.
> 
>   The opt_io characteristic indicates the preferred I/O size reported by
>   the device.  This is usually the stripe width for arrays.  The value
>   is in sysfs.
> 
>   The alignment parameter indicates the number of bytes the start of the
>   device/partition is offset from the device granularity.  Partition
>   tools and MD/DM tools can use this to align filesystems to the proper
>   boundaries.
> 
> Signed-off-by: Martin K. Petersen <martin.petersen@xxxxxxxxxx>
> 
> ---
> 6 files changed, 204 insertions(+), 4 deletions(-)
> block/blk-settings.c   |  135 ++++++++++++++++++++++++++++++++++++++++++++++--
> block/blk-sysfs.c      |   22 +++++++
> block/genhd.c          |   10 +++
> fs/partitions/check.c  |   10 +++
> include/linux/blkdev.h |   30 ++++++++++
> include/linux/genhd.h  |    1 
> 
> 
> 
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -292,22 +292,87 @@ EXPORT_SYMBOL(blk_queue_max_segment_size
>   *
>   * Description:
>   *   This should typically be set to the lowest possible sector size
> - *   that the hardware can operate on (possible without reverting to
> - *   even internal read-modify-write operations). Usually the default
> - *   of 512 covers most hardware.
> + *   (logical block size) that the hardware can operate on.  Usually the
> + *   default of 512 covers most hardware.
>   **/
>  void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
>  {
> -	q->hardsect_size = size;
> +	q->hardsect_size = q->granularity = size;
>  }
>  EXPORT_SYMBOL(blk_queue_hardsect_size);
>  
> +/**
> + * blk_queue_granularity - set I/O granularity for the queue
> + * @q:  the request queue for the device
> + * @size:  the I/O granularity, in bytes
> + *
> + * Description:
> + *   This should typically be set to the lowest possible sector size
> + *   that the hardware can operate on without reverting to
> + *   read-modify-write operations.
> + **/
> +void blk_queue_granularity(struct request_queue *q, unsigned short size)
> +{
> +	q->granularity = size;
> +}
> +EXPORT_SYMBOL(blk_queue_granularity);
> +
> +/**
> + * blk_queue_alignment - set alignment for the queue
> + * @q:  the request queue for the device
> + * @alignment:  alignment offset in bytes
> + *
> + * Description:
> + *   Some devices are naturally misaligned to compensate for things like
> + *   the legacy DOS partition table 63-sector offset.  Low-level drivers
> + *   should call this function for devices whose first sector is not
> + *   naturally aligned.
> + */
> +void blk_queue_alignment(struct request_queue *q, unsigned int alignment)
> +{
> +	q->alignment = alignment & (q->granularity - 1);
> +	clear_bit(QUEUE_FLAG_MISALIGNED, &q->queue_flags);
> +}
> +EXPORT_SYMBOL(blk_queue_alignment);

How would low-level drivers know that a device's first sector is not
naturally aligned?

> +
>  /*
>   * Returns the minimum that is _not_ zero, unless both are zero.
>   */
>  #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
>  
>  /**
> + * blk_queue_min_io - set minimum request size for the queue
> + * @q:  the request queue for the device
> + * @min_io:  smallest I/O size in bytes
> + *
> + * Description:
> + *   Some devices have an internal block size bigger than the reported
> + *   hardware sector size.  This function can be used to signal the
> + *   smallest I/O the device can perform without incurring a performance
> + *   penalty.
> + */
> +void blk_queue_min_io(struct request_queue *q, unsigned int min_io)
> +{
> +	q->min_io = min_io;
> +}
> +EXPORT_SYMBOL(blk_queue_min_io);
> +
> +/**
> + * blk_queue_opt_io - set optimal request size for the queue
> + * @q:  the request queue for the device
> + * @opt_io:  optimal request size in bytes
> + *
> + * Description:
> + *   Drivers can call this function to set the preferred I/O request
> + *   size for devices that report such a value.
> + */
> +void blk_queue_opt_io(struct request_queue *q, unsigned int opt_io)
> +{
> +	q->opt_io = opt_io;
> +}
> +EXPORT_SYMBOL(blk_queue_opt_io);
> +
> +/**
>   * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
>   * @t:	the stacking driver (top)
>   * @b:  the underlying device (bottom)
> @@ -335,6 +400,68 @@ void blk_queue_stack_limits(struct reque
>  EXPORT_SYMBOL(blk_queue_stack_limits);
>  
>  /**
> + * blk_queue_stack_topology - adjust queue limits for stacked drivers
> + * @t:	the stacking driver (top)
> + * @bdev:  the underlying block device (bottom)
> + * @offset:  offset to beginning of data within component device
> + **/
> +void blk_queue_stack_topology(struct request_queue *t, struct block_device *bdev,
> +			      sector_t offset)
> +{
> +	struct request_queue *b = bdev_get_queue(bdev);
> +	int misaligned;
> +
> +	/* zero is "infinity" */
> +	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
> +	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
> +	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask);
> +
> +	t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments);
> +	t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments);
> +	t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size);
> +	t->hardsect_size = max(t->hardsect_size, b->hardsect_size);
> +	t->min_io = max(t->min_io, b->min_io);
> +	t->granularity = max(t->granularity, b->granularity);
> +
> +	misaligned = 0;
> +	offset += get_start_sect(bdev) << 9;
> +
> +	/* Bottom device offset aligned? */
> +	if (offset && (offset & (b->granularity - 1)) != b->alignment) {
> +		misaligned = 1;
> +		goto out;
> +	}
> +
> +	/* If top has no alignment, inherit from bottom */
> +	if (!t->alignment)
> +		t->alignment = b->alignment & (b->granularity - 1);
> +
> +	/* Top alignment on logical block boundary? */
> +	if (t->alignment & (t->hardsect_size - 1)) {
> +		misaligned = 1;
> +		goto out;
> +	}
> +
> +out:
> +	if (!t->queue_lock)
> +		WARN_ON_ONCE(1);
> +	else if (misaligned || !test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(t->queue_lock, flags);
> +
> +		if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
> +			queue_flag_clear(QUEUE_FLAG_CLUSTER, t);
> +
> +		if (misaligned)
> +			queue_flag_set(QUEUE_FLAG_MISALIGNED, t);
> +
> +		spin_unlock_irqrestore(t->queue_lock, flags);
> +	}
> +}
> +EXPORT_SYMBOL(blk_queue_stack_topology);
> +
> +/**
>   * blk_queue_dma_pad - set pad mask
>   * @q:     the request queue for the device
>   * @mask:  pad mask
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -105,6 +105,16 @@ static ssize_t queue_hw_sector_size_show
>  	return queue_var_show(q->hardsect_size, page);
>  }
>  
> +static ssize_t queue_min_io_show(struct request_queue *q, char *page)
> +{
> +	return queue_var_show(q->min_io, page);
> +}
> +
> +static ssize_t queue_opt_io_show(struct request_queue *q, char *page)
> +{
> +	return queue_var_show(q->opt_io, page);
> +}
> +
>  static ssize_t
>  queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
>  {
> @@ -256,6 +266,16 @@ static struct queue_sysfs_entry queue_hw
>  	.show = queue_hw_sector_size_show,
>  };
>  
> +static struct queue_sysfs_entry queue_min_io_entry = {
> +	.attr = {.name = "minimum_io_size", .mode = S_IRUGO },
> +	.show = queue_min_io_show,
> +};
> +
> +static struct queue_sysfs_entry queue_opt_io_entry = {
> +	.attr = {.name = "optimal_io_size", .mode = S_IRUGO },
> +	.show = queue_opt_io_show,
> +};
> +
>  static struct queue_sysfs_entry queue_nonrot_entry = {
>  	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
>  	.show = queue_nonrot_show,
> @@ -287,6 +307,8 @@ static struct attribute *default_attrs[]
>  	&queue_max_sectors_entry.attr,
>  	&queue_iosched_entry.attr,
>  	&queue_hw_sector_size_entry.attr,
> +	&queue_min_io_entry.attr,
> +	&queue_opt_io_entry.attr,
>  	&queue_nonrot_entry.attr,
>  	&queue_nomerges_entry.attr,
>  	&queue_rq_affinity_entry.attr,
> diff --git a/block/genhd.c b/block/genhd.c
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -848,11 +848,20 @@ static ssize_t disk_capability_show(stru
>  	return sprintf(buf, "%x\n", disk->flags);
>  }
>  
> +static ssize_t disk_alignment_show(struct device *dev,
> +				   struct device_attribute *attr, char *buf)
> +{
> +	struct gendisk *disk = dev_to_disk(dev);
> +
> +	return sprintf(buf, "%d\n", queue_alignment(disk->queue));
> +}
> +
>  static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
>  static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
>  static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
>  static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
>  static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
> +static DEVICE_ATTR(alignment, S_IRUGO, disk_alignment_show, NULL);
>  static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
>  static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
>  #ifdef CONFIG_FAIL_MAKE_REQUEST
> @@ -871,6 +880,7 @@ static struct attribute *disk_attrs[] = 
>  	&dev_attr_removable.attr,
>  	&dev_attr_ro.attr,
>  	&dev_attr_size.attr,
> +	&dev_attr_alignment.attr,
>  	&dev_attr_capability.attr,
>  	&dev_attr_stat.attr,
>  #ifdef CONFIG_FAIL_MAKE_REQUEST
> diff --git a/fs/partitions/check.c b/fs/partitions/check.c
> --- a/fs/partitions/check.c
> +++ b/fs/partitions/check.c
> @@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *de
>  	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
>  }
>  
> +ssize_t part_alignment_show(struct device *dev,
> +			    struct device_attribute *attr, char *buf)
> +{
> +	struct hd_struct *p = dev_to_part(dev);
> +	return sprintf(buf, "%llu\n", (unsigned long long)p->alignment);
> +}
> +
>  ssize_t part_stat_show(struct device *dev,
>  		       struct device_attribute *attr, char *buf)
>  {
> @@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *d
>  static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
>  static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
>  static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
> +static DEVICE_ATTR(alignment, S_IRUGO, part_alignment_show, NULL);
>  static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
>  #ifdef CONFIG_FAIL_MAKE_REQUEST
>  static struct device_attribute dev_attr_fail =
> @@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = 
>  	&dev_attr_partition.attr,
>  	&dev_attr_start.attr,
>  	&dev_attr_size.attr,
> +	&dev_attr_alignment.attr,
>  	&dev_attr_stat.attr,
>  #ifdef CONFIG_FAIL_MAKE_REQUEST
>  	&dev_attr_fail.attr,
> @@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct g
>  	pdev = part_to_dev(p);
>  
>  	p->start_sect = start;
> +	p->alignment = queue_sector_alignment(disk->queue, start);
>  	p->nr_sects = len;
>  	p->partno = partno;
>  	p->policy = get_disk_ro(disk);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -402,6 +402,10 @@ struct request_queue
>  	unsigned short		max_hw_segments;
>  	unsigned short		hardsect_size;
>  	unsigned int		max_segment_size;
> +	unsigned int		alignment;
> +	unsigned int		granularity;
> +	unsigned int		min_io;
> +	unsigned int		opt_io;
>  
>  	unsigned long		seg_boundary_mask;
>  	void			*dma_drain_buffer;

Patch looks fine, but can we group these fields under a common name
prefix, please? A bare "alignment" could refer to anything. For example:

+	unsigned int		io_alignment;
+	unsigned int		io_granularity;
+	unsigned int		io_min;
+	unsigned int		io_opt;
>  

> @@ -461,6 +465,7 @@ struct request_queue
>  #define QUEUE_FLAG_NONROT      14	/* non-rotational device (SSD) */
>  #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
>  #define QUEUE_FLAG_IO_STAT     15	/* do IO stats */
> +#define QUEUE_FLAG_MISALIGNED  16	/* bdev not aligned to disk */
>  
>  #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
>  				 (1 << QUEUE_FLAG_CLUSTER) |		\
> @@ -877,7 +882,15 @@ extern void blk_queue_max_phys_segments(
>  extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
>  extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
>  extern void blk_queue_hardsect_size(struct request_queue *, unsigned short);
> +extern void blk_queue_granularity(struct request_queue *, unsigned short);
> +extern void blk_queue_alignment(struct request_queue *q,
> +				unsigned int alignment);
> +extern void blk_queue_min_io(struct request_queue *q, unsigned int min_io);
> +extern void blk_queue_opt_io(struct request_queue *q, unsigned int opt_io);
>  extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
> +extern void blk_queue_stack_topology(struct request_queue *t,
> +				     struct block_device *bdev,
> +				     sector_t offset);
>  extern void blk_queue_dma_pad(struct request_queue *, unsigned int);
>  extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
>  extern int blk_queue_dma_drain(struct request_queue *q,
> @@ -978,6 +991,23 @@ static inline int bdev_hardsect_size(str
>  	return queue_hardsect_size(bdev_get_queue(bdev));
>  }
>  
> +static inline int queue_alignment(struct request_queue *q)
> +{
> +	if (q && test_bit(QUEUE_FLAG_MISALIGNED, &q->queue_flags))
> +		return -1;
> +
> +	if (q && q->alignment)
> +		return q->alignment;
> +
> +	return 0;
> +}
> +
> +static inline int queue_sector_alignment(struct request_queue *q,
> +					 sector_t sector)
> +{
> +	return ((sector << 9) - q->alignment) & (q->min_io - 1);
> +}
> +
>  static inline int queue_dma_alignment(struct request_queue *q)
>  {
>  	return q ? q->dma_alignment : 511;
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -90,6 +90,7 @@ struct disk_stats {
>  struct hd_struct {
>  	sector_t start_sect;
>  	sector_t nr_sects;
> +	sector_t alignment;
>  	struct device __dev;
>  	struct kobject *holder_dir;
>  	int policy, partno;
> 
> 

-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Filesystems]     [Linux SCSI]     [Linux RAID]     [Git]     [Kernel Newbies]     [Linux Newbie]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Samba]     [Device Mapper]

  Powered by Linux