On Thu, Apr 23 2009, Martin K. Petersen wrote: > To support devices with physical block sizes bigger than 512 bytes we > need to ensure proper alignment. This patch adds support for exposing > I/O topology characteristics as devices are stacked. > > hardsect_size remains unchanged. It is the smallest atomic unit the > device can address (i.e. logical block size). > > granularity indicates the smallest I/O the device can access without > incurring a read-modify-write penalty. The granularity is set by > low-level drivers; from then on it is purely internal to the stacking > logic. > > The min_io parameter is the smallest preferred I/O size reported by > the device. In many cases this is the same as granularity. However, > the min_io parameter can be scaled up when stacking (RAID5 chunk > size > physical sector size). min_io is available in sysfs. > > The opt_io characteristic indicates the preferred I/O size reported by > the device. This is usually the stripe width for arrays. The value > is in sysfs. > > The alignment parameter indicates the number of bytes the start of the > device/partition is offset from the device granularity. Partition > tools and MD/DM tools can use this to align filesystems to the proper > boundaries. > > Signed-off-by: Martin K. 
Petersen <martin.petersen@xxxxxxxxxx> > > --- > 6 files changed, 204 insertions(+), 4 deletions(-) > block/blk-settings.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++-- > block/blk-sysfs.c | 22 +++++++ > block/genhd.c | 10 +++ > fs/partitions/check.c | 10 +++ > include/linux/blkdev.h | 30 ++++++++++ > include/linux/genhd.h | 1 > > > > diff --git a/block/blk-settings.c b/block/blk-settings.c > --- a/block/blk-settings.c > +++ b/block/blk-settings.c > @@ -292,22 +292,87 @@ EXPORT_SYMBOL(blk_queue_max_segment_size > * > * Description: > * This should typically be set to the lowest possible sector size > - * that the hardware can operate on (possible without reverting to > - * even internal read-modify-write operations). Usually the default > - * of 512 covers most hardware. > + * (logical block size) that the hardware can operate on. Usually the > + * default of 512 covers most hardware. > **/ > void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) > { > - q->hardsect_size = size; > + q->hardsect_size = q->granularity = size; > } > EXPORT_SYMBOL(blk_queue_hardsect_size); > > +/** > + * blk_queue_granularity - set I/O granularity for the queue > + * @q: the request queue for the device > + * @size: the I/O granularity, in bytes > + * > + * Description: > + * This should typically be set to the lowest possible sector size > + * that the hardware can operate on without reverting to > + * read-modify-write operations. > + **/ > +void blk_queue_granularity(struct request_queue *q, unsigned short size) > +{ > + q->granularity = size; > +} > +EXPORT_SYMBOL(blk_queue_granularity); > + > +/** > + * blk_queue_alignment - set alignment for the queue > + * @q: the request queue for the device > + * @alignment: alignment offset in bytes > + * > + * Description: > + * Some devices are naturally misaligned to compensate for things like > + * the legacy DOS partition table 63-sector offset. 
Low-level drivers > + * should call this function for devices whose first sector is not > + * naturally aligned. > + */ > +void blk_queue_alignment(struct request_queue *q, unsigned int alignment) > +{ > + q->alignment = alignment & (q->granularity - 1); > + clear_bit(QUEUE_FLAG_MISALIGNED, &q->queue_flags); > +} > +EXPORT_SYMBOL(blk_queue_alignment); How would low-level drivers know? > + > /* > * Returns the minimum that is _not_ zero, unless both are zero. > */ > #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) > > /** > + * blk_queue_min_io - set minimum request size for the queue > + * @q: the request queue for the device > + * @min_io: smallest I/O size in bytes > + * > + * Description: > + * Some devices have an internal block size bigger than the reported > + * hardware sector size. This function can be used to signal the > + * smallest I/O the device can perform without incurring a performance > + * penalty. > + */ > +void blk_queue_min_io(struct request_queue *q, unsigned int min_io) > +{ > + q->min_io = min_io; > +} > +EXPORT_SYMBOL(blk_queue_min_io); > + > +/** > + * blk_queue_opt_io - set optimal request size for the queue > + * @q: the request queue for the device > + * @opt_io: optimal request size in bytes > + * > + * Description: > + * Drivers can call this function to set the preferred I/O request > + * size for devices that report such a value. 
> + */ > +void blk_queue_opt_io(struct request_queue *q, unsigned int opt_io) > +{ > + q->opt_io = opt_io; > +} > +EXPORT_SYMBOL(blk_queue_opt_io); > + > +/** > * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers > * @t: the stacking driver (top) > * @b: the underlying device (bottom) > @@ -335,6 +400,68 @@ void blk_queue_stack_limits(struct reque > EXPORT_SYMBOL(blk_queue_stack_limits); > > /** > + * blk_queue_stack_topology - adjust queue limits for stacked drivers > + * @t: the stacking driver (top) > + * @bdev: the underlying block device (bottom) > + * @offset: offset to beginning of data within component device > + **/ > +void blk_queue_stack_topology(struct request_queue *t, struct block_device *bdev, > + sector_t offset) > +{ > + struct request_queue *b = bdev_get_queue(bdev); > + int misaligned; > + > + /* zero is "infinity" */ > + t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); > + t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); > + t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); > + > + t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments); > + t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments); > + t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); > + t->hardsect_size = max(t->hardsect_size, b->hardsect_size); > + t->min_io = max(t->min_io, b->min_io); > + t->granularity = max(t->granularity, b->granularity); > + > + misaligned = 0; > + offset += get_start_sect(bdev) << 9; > + > + /* Bottom device offset aligned? */ > + if (offset && (offset & (b->granularity - 1)) != b->alignment) { > + misaligned = 1; > + goto out; > + } > + > + /* If top has no alignment, inherit from bottom */ > + if (!t->alignment) > + t->alignment = b->alignment & (b->granularity - 1); > + > + /* Top alignment on logical block boundary? 
*/ > + if (t->alignment & (t->hardsect_size - 1)) { > + misaligned = 1; > + goto out; > + } > + > +out: > + if (!t->queue_lock) > + WARN_ON_ONCE(1); > + else if (misaligned || !test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { > + unsigned long flags; > + > + spin_lock_irqsave(t->queue_lock, flags); > + > + if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) > + queue_flag_clear(QUEUE_FLAG_CLUSTER, t); > + > + if (misaligned) > + queue_flag_set(QUEUE_FLAG_MISALIGNED, t); > + > + spin_unlock_irqrestore(t->queue_lock, flags); > + } > +} > +EXPORT_SYMBOL(blk_queue_stack_topology); > + > +/** > * blk_queue_dma_pad - set pad mask > * @q: the request queue for the device > * @mask: pad mask > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c > --- a/block/blk-sysfs.c > +++ b/block/blk-sysfs.c > @@ -105,6 +105,16 @@ static ssize_t queue_hw_sector_size_show > return queue_var_show(q->hardsect_size, page); > } > > +static ssize_t queue_min_io_show(struct request_queue *q, char *page) > +{ > + return queue_var_show(q->min_io, page); > +} > + > +static ssize_t queue_opt_io_show(struct request_queue *q, char *page) > +{ > + return queue_var_show(q->opt_io, page); > +} > + > static ssize_t > queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) > { > @@ -256,6 +266,16 @@ static struct queue_sysfs_entry queue_hw > .show = queue_hw_sector_size_show, > }; > > +static struct queue_sysfs_entry queue_min_io_entry = { > + .attr = {.name = "minimum_io_size", .mode = S_IRUGO }, > + .show = queue_min_io_show, > +}; > + > +static struct queue_sysfs_entry queue_opt_io_entry = { > + .attr = {.name = "optimal_io_size", .mode = S_IRUGO }, > + .show = queue_opt_io_show, > +}; > + > static struct queue_sysfs_entry queue_nonrot_entry = { > .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, > .show = queue_nonrot_show, > @@ -287,6 +307,8 @@ static struct attribute *default_attrs[] > &queue_max_sectors_entry.attr, > &queue_iosched_entry.attr, > 
&queue_hw_sector_size_entry.attr, > + &queue_min_io_entry.attr, > + &queue_opt_io_entry.attr, > &queue_nonrot_entry.attr, > &queue_nomerges_entry.attr, > &queue_rq_affinity_entry.attr, > diff --git a/block/genhd.c b/block/genhd.c > --- a/block/genhd.c > +++ b/block/genhd.c > @@ -848,11 +848,20 @@ static ssize_t disk_capability_show(stru > return sprintf(buf, "%x\n", disk->flags); > } > > +static ssize_t disk_alignment_show(struct device *dev, > + struct device_attribute *attr, char *buf) > +{ > + struct gendisk *disk = dev_to_disk(dev); > + > + return sprintf(buf, "%d\n", queue_alignment(disk->queue)); > +} > + > static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); > static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); > static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); > static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); > static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); > +static DEVICE_ATTR(alignment, S_IRUGO, disk_alignment_show, NULL); > static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); > static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); > #ifdef CONFIG_FAIL_MAKE_REQUEST > @@ -871,6 +880,7 @@ static struct attribute *disk_attrs[] = > &dev_attr_removable.attr, > &dev_attr_ro.attr, > &dev_attr_size.attr, > + &dev_attr_alignment.attr, > &dev_attr_capability.attr, > &dev_attr_stat.attr, > #ifdef CONFIG_FAIL_MAKE_REQUEST > diff --git a/fs/partitions/check.c b/fs/partitions/check.c > --- a/fs/partitions/check.c > +++ b/fs/partitions/check.c > @@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *de > return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); > } > > +ssize_t part_alignment_show(struct device *dev, > + struct device_attribute *attr, char *buf) > +{ > + struct hd_struct *p = dev_to_part(dev); > + return sprintf(buf, "%llu\n", (unsigned long long)p->alignment); > +} > + > ssize_t part_stat_show(struct device *dev, > struct device_attribute *attr, char *buf) > { > @@ 
-272,6 +279,7 @@ ssize_t part_fail_store(struct device *d > static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); > static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); > static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); > +static DEVICE_ATTR(alignment, S_IRUGO, part_alignment_show, NULL); > static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); > #ifdef CONFIG_FAIL_MAKE_REQUEST > static struct device_attribute dev_attr_fail = > @@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = > &dev_attr_partition.attr, > &dev_attr_start.attr, > &dev_attr_size.attr, > + &dev_attr_alignment.attr, > &dev_attr_stat.attr, > #ifdef CONFIG_FAIL_MAKE_REQUEST > &dev_attr_fail.attr, > @@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct g > pdev = part_to_dev(p); > > p->start_sect = start; > + p->alignment = queue_sector_alignment(disk->queue, start); > p->nr_sects = len; > p->partno = partno; > p->policy = get_disk_ro(disk); > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h > --- a/include/linux/blkdev.h > +++ b/include/linux/blkdev.h > @@ -402,6 +402,10 @@ struct request_queue > unsigned short max_hw_segments; > unsigned short hardsect_size; > unsigned int max_segment_size; > + unsigned int alignment; > + unsigned int granularity; > + unsigned int min_io; > + unsigned int opt_io; > > unsigned long seg_boundary_mask; > void *dma_drain_buffer; Patch looks fine, but can we group these by name please? alignment could be anything. 
+ unsigned int io_alignment; + unsigned int io_granularity; + unsigned int io_min; + unsigned int io_opt; > > @@ -461,6 +465,7 @@ struct request_queue > #define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */ > #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ > #define QUEUE_FLAG_IO_STAT 15 /* do IO stats */ > +#define QUEUE_FLAG_MISALIGNED 16 /* bdev not aligned to disk */ > > #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ > (1 << QUEUE_FLAG_CLUSTER) | \ > @@ -877,7 +882,15 @@ extern void blk_queue_max_phys_segments( > extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); > extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); > extern void blk_queue_hardsect_size(struct request_queue *, unsigned short); > +extern void blk_queue_granularity(struct request_queue *, unsigned short); > +extern void blk_queue_alignment(struct request_queue *q, > + unsigned int alignment); > +extern void blk_queue_min_io(struct request_queue *q, unsigned int min_io); > +extern void blk_queue_opt_io(struct request_queue *q, unsigned int opt_io); > extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); > +extern void blk_queue_stack_topology(struct request_queue *t, > + struct block_device *bdev, > + sector_t offset); > extern void blk_queue_dma_pad(struct request_queue *, unsigned int); > extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); > extern int blk_queue_dma_drain(struct request_queue *q, > @@ -978,6 +991,23 @@ static inline int bdev_hardsect_size(str > return queue_hardsect_size(bdev_get_queue(bdev)); > } > > +static inline int queue_alignment(struct request_queue *q) > +{ > + if (q && test_bit(QUEUE_FLAG_MISALIGNED, &q->queue_flags)) > + return -1; > + > + if (q && q->alignment) > + return q->alignment; > + > + return 0; > +} > + > +static inline int queue_sector_alignment(struct request_queue *q, > + sector_t sector) > +{ > + return ((sector 
<< 9) - q->alignment) & (q->min_io - 1); > +} > + > static inline int queue_dma_alignment(struct request_queue *q) > { > return q ? q->dma_alignment : 511; > diff --git a/include/linux/genhd.h b/include/linux/genhd.h > --- a/include/linux/genhd.h > +++ b/include/linux/genhd.h > @@ -90,6 +90,7 @@ struct disk_stats { > struct hd_struct { > sector_t start_sect; > sector_t nr_sects; > + sector_t alignment; > struct device __dev; > struct kobject *holder_dir; > int policy, partno; > > -- Jens Axboe -- To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html