[PATCH 2 of 8] block: Export I/O topology for block devices and partitions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



To support devices with physical block sizes bigger than 512 bytes we
need to ensure proper alignment.  This patch adds support for exposing
I/O topology characteristics as devices are stacked.

  hardsect_size remains unchanged.  It is the smallest atomic unit the
  device can address (i.e. logical block size).

  granularity indicates the smallest I/O the device can access without
  incurring a read-modify-write penalty.  The granularity is set by
  low-level drivers; from then on it is purely internal to the stacking
  logic.

  The min_io parameter is the smallest preferred I/O size reported by
  the device.  In many cases this is the same as granularity.  However,
  the min_io parameter can be scaled up when stacking (RAID5 chunk
  size > physical sector size).  min_io is available in sysfs.

  The opt_io characteristic indicates the optimal I/O size reported by
  the device.  This is usually the stripe width for arrays.  The value
  is available in sysfs.

  The alignment parameter indicates the number of bytes the start of the
  device/partition is offset from the device granularity.  Partition
  tools and MD/DM tools can use this to align filesystems to the proper
  boundaries.

Signed-off-by: Martin K. Petersen <martin.petersen@xxxxxxxxxx>

---
6 files changed, 204 insertions(+), 4 deletions(-)
block/blk-settings.c   |  135 ++++++++++++++++++++++++++++++++++++++++++++++--
block/blk-sysfs.c      |   22 +++++++
block/genhd.c          |   10 +++
fs/partitions/check.c  |   10 +++
include/linux/blkdev.h |   30 ++++++++++
include/linux/genhd.h  |    1 



diff --git a/block/blk-settings.c b/block/blk-settings.c
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -292,22 +292,87 @@ EXPORT_SYMBOL(blk_queue_max_segment_size
  *
  * Description:
  *   This should typically be set to the lowest possible sector size
- *   that the hardware can operate on (possible without reverting to
- *   even internal read-modify-write operations). Usually the default
- *   of 512 covers most hardware.
+ *   (logical block size) that the hardware can operate on.  Usually the
+ *   default of 512 covers most hardware.
  **/
 void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
 {
-	q->hardsect_size = size;
+	q->hardsect_size = q->granularity = size;
 }
 EXPORT_SYMBOL(blk_queue_hardsect_size);
 
+/**
+ * blk_queue_granularity - set I/O granularity for the queue
+ * @q:  the request queue for the device
+ * @size:  the I/O granularity, in bytes
+ *
+ * Description:
+ *   This should typically be set to the lowest possible sector size
+ *   that the hardware can operate on without resorting to
+ *   read-modify-write operations.
+ **/
+void blk_queue_granularity(struct request_queue *q, unsigned short size)
+{
+	q->granularity = size;
+}
+EXPORT_SYMBOL(blk_queue_granularity);
+
+/**
+ * blk_queue_alignment - set alignment for the queue
+ * @q:  the request queue for the device
+ * @alignment:  alignment offset in bytes
+ *
+ * Description:
+ *   Some devices are naturally misaligned to compensate for things like
+ *   the legacy DOS partition table 63-sector offset.  Low-level drivers
+ *   should call this function for devices whose first sector is not
+ *   naturally aligned.
+ */
+void blk_queue_alignment(struct request_queue *q, unsigned int alignment)
+{
+	q->alignment = alignment & (q->granularity - 1);
+	clear_bit(QUEUE_FLAG_MISALIGNED, &q->queue_flags);
+}
+EXPORT_SYMBOL(blk_queue_alignment);
+
 /*
  * Returns the minimum that is _not_ zero, unless both are zero.
  */
 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
 
 /**
+ * blk_queue_min_io - set minimum request size for the queue
+ * @q:  the request queue for the device
+ * @min_io:  smallest I/O size in bytes
+ *
+ * Description:
+ *   Some devices have an internal block size bigger than the reported
+ *   hardware sector size.  This function can be used to signal the
+ *   smallest I/O the device can perform without incurring a performance
+ *   penalty.
+ */
+void blk_queue_min_io(struct request_queue *q, unsigned int min_io)
+{
+	q->min_io = min_io;
+}
+EXPORT_SYMBOL(blk_queue_min_io);
+
+/**
+ * blk_queue_opt_io - set optimal request size for the queue
+ * @q:  the request queue for the device
+ * @opt_io:  optimal request size in bytes
+ *
+ * Description:
+ *   Drivers can call this function to set the preferred I/O request
+ *   size for devices that report such a value.
+ */
+void blk_queue_opt_io(struct request_queue *q, unsigned int opt_io)
+{
+	q->opt_io = opt_io;
+}
+EXPORT_SYMBOL(blk_queue_opt_io);
+
+/**
  * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
  * @t:	the stacking driver (top)
  * @b:  the underlying device (bottom)
@@ -335,6 +400,68 @@ void blk_queue_stack_limits(struct reque
 EXPORT_SYMBOL(blk_queue_stack_limits);
 
 /**
+ * blk_queue_stack_topology - adjust queue limits for stacked drivers
+ * @t:	the stacking driver (top)
+ * @bdev:  the underlying block device (bottom)
+ * @offset:  offset to beginning of data within component device
+ **/
+void blk_queue_stack_topology(struct request_queue *t, struct block_device *bdev,
+			      sector_t offset)
+{
+	struct request_queue *b = bdev_get_queue(bdev);
+	int misaligned;
+
+	/* zero is "infinity" */
+	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
+	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask);
+
+	t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments);
+	t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments);
+	t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size);
+	t->hardsect_size = max(t->hardsect_size, b->hardsect_size);
+	t->min_io = max(t->min_io, b->min_io);
+	t->granularity = max(t->granularity, b->granularity);
+
+	misaligned = 0;
+	offset += get_start_sect(bdev) << 9;
+
+	/* Bottom device offset aligned? */
+	if (offset && (offset & (b->granularity - 1)) != b->alignment) {
+		misaligned = 1;
+		goto out;
+	}
+
+	/* If top has no alignment, inherit from bottom */
+	if (!t->alignment)
+		t->alignment = b->alignment & (b->granularity - 1);
+
+	/* Top alignment on logical block boundary? */
+	if (t->alignment & (t->hardsect_size - 1)) {
+		misaligned = 1;
+		goto out;
+	}
+
+out:
+	if (!t->queue_lock)
+		WARN_ON_ONCE(1);
+	else if (misaligned || !test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(t->queue_lock, flags);
+
+		if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
+			queue_flag_clear(QUEUE_FLAG_CLUSTER, t);
+
+		if (misaligned)
+			queue_flag_set(QUEUE_FLAG_MISALIGNED, t);
+
+		spin_unlock_irqrestore(t->queue_lock, flags);
+	}
+}
+EXPORT_SYMBOL(blk_queue_stack_topology);
+
+/**
  * blk_queue_dma_pad - set pad mask
  * @q:     the request queue for the device
  * @mask:  pad mask
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -105,6 +105,16 @@ static ssize_t queue_hw_sector_size_show
 	return queue_var_show(q->hardsect_size, page);
 }
 
+static ssize_t queue_min_io_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->min_io, page);
+}
+
+static ssize_t queue_opt_io_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->opt_io, page);
+}
+
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
@@ -256,6 +266,16 @@ static struct queue_sysfs_entry queue_hw
 	.show = queue_hw_sector_size_show,
 };
 
+static struct queue_sysfs_entry queue_min_io_entry = {
+	.attr = {.name = "minimum_io_size", .mode = S_IRUGO },
+	.show = queue_min_io_show,
+};
+
+static struct queue_sysfs_entry queue_opt_io_entry = {
+	.attr = {.name = "optimal_io_size", .mode = S_IRUGO },
+	.show = queue_opt_io_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
 	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_nonrot_show,
@@ -287,6 +307,8 @@ static struct attribute *default_attrs[]
 	&queue_max_sectors_entry.attr,
 	&queue_iosched_entry.attr,
 	&queue_hw_sector_size_entry.attr,
+	&queue_min_io_entry.attr,
+	&queue_opt_io_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
diff --git a/block/genhd.c b/block/genhd.c
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -848,11 +848,20 @@ static ssize_t disk_capability_show(stru
 	return sprintf(buf, "%x\n", disk->flags);
 }
 
+static ssize_t disk_alignment_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	return sprintf(buf, "%d\n", queue_alignment(disk->queue));
+}
+
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(alignment, S_IRUGO, disk_alignment_show, NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -871,6 +880,7 @@ static struct attribute *disk_attrs[] = 
 	&dev_attr_removable.attr,
 	&dev_attr_ro.attr,
 	&dev_attr_size.attr,
+	&dev_attr_alignment.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *de
 	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
 
+ssize_t part_alignment_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	return sprintf(buf, "%llu\n", (unsigned long long)p->alignment);
+}
+
 ssize_t part_stat_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
 {
@@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *d
 static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(alignment, S_IRUGO, part_alignment_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
@@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = 
 	&dev_attr_partition.attr,
 	&dev_attr_start.attr,
 	&dev_attr_size.attr,
+	&dev_attr_alignment.attr,
 	&dev_attr_stat.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
@@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct g
 	pdev = part_to_dev(p);
 
 	p->start_sect = start;
+	p->alignment = queue_sector_alignment(disk->queue, start);
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -402,6 +402,10 @@ struct request_queue
 	unsigned short		max_hw_segments;
 	unsigned short		hardsect_size;
 	unsigned int		max_segment_size;
+	unsigned int		alignment;
+	unsigned int		granularity;
+	unsigned int		min_io;
+	unsigned int		opt_io;
 
 	unsigned long		seg_boundary_mask;
 	void			*dma_drain_buffer;
@@ -461,6 +465,7 @@ struct request_queue
 #define QUEUE_FLAG_NONROT      14	/* non-rotational device (SSD) */
 #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
 #define QUEUE_FLAG_IO_STAT     15	/* do IO stats */
+#define QUEUE_FLAG_MISALIGNED  16	/* bdev not aligned to disk */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_CLUSTER) |		\
@@ -877,7 +882,15 @@ extern void blk_queue_max_phys_segments(
 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
 extern void blk_queue_hardsect_size(struct request_queue *, unsigned short);
+extern void blk_queue_granularity(struct request_queue *, unsigned short);
+extern void blk_queue_alignment(struct request_queue *q,
+				unsigned int alignment);
+extern void blk_queue_min_io(struct request_queue *q, unsigned int min_io);
+extern void blk_queue_opt_io(struct request_queue *q, unsigned int opt_io);
 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
+extern void blk_queue_stack_topology(struct request_queue *t,
+				     struct block_device *bdev,
+				     sector_t offset);
 extern void blk_queue_dma_pad(struct request_queue *, unsigned int);
 extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
 extern int blk_queue_dma_drain(struct request_queue *q,
@@ -978,6 +991,23 @@ static inline int bdev_hardsect_size(str
 	return queue_hardsect_size(bdev_get_queue(bdev));
 }
 
+static inline int queue_alignment(struct request_queue *q)
+{
+	if (q && test_bit(QUEUE_FLAG_MISALIGNED, &q->queue_flags))
+		return -1;
+
+	if (q && q->alignment)
+		return q->alignment;
+
+	return 0;
+}
+
+static inline int queue_sector_alignment(struct request_queue *q,
+					 sector_t sector)
+{
+	return ((sector << 9) - q->alignment) & (q->min_io - 1);
+}
+
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 	return q ? q->dma_alignment : 511;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -90,6 +90,7 @@ struct disk_stats {
 struct hd_struct {
 	sector_t start_sect;
 	sector_t nr_sects;
+	sector_t alignment;
 	struct device __dev;
 	struct kobject *holder_dir;
 	int policy, partno;


--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Filesystems]     [Linux SCSI]     [Linux RAID]     [Git]     [Kernel Newbies]     [Linux Newbie]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Samba]     [Device Mapper]

  Powered by Linux