[PATCH] block: Provide I/O alignment statistics

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



As an aid to users with 512e drives and disk arrays we now keep track of
whether submitted I/O requests are aligned to the block sizes contained
in the queue limits. We record whether each request is a properly
aligned multiple of the optimal, minimum, physical and logical block
sizes respectively. There are separate metrics for reads and writes and
the reported device alignment is taken into account. The statistics are
available in block/alignment_stat and block/<part>/alignment_stat. The
output format is documented in Documentation/ABI/testing/sysfs-block.

Signed-off-by: Martin K. Petersen <martin.petersen@xxxxxxxxxx>

---

Example run unpacking a kernel tarball using XFS on a 512e device.
Partition sdb1 is aligned, sdb2 is misaligned. Output has been
pretty-printed for readability:

RD    TOTAL      LBS      PBS      MIN      OPT   
sdb1:   564      136        0      428        0   
sdb2:   501      501        0        0        0   

WR    TOTAL      LBS      PBS      MIN      OPT
sdb1: 24440       42        0    24225      613
sdb2: 25248    25687        0        1        0

As expected, we were pretty good at issuing big I/O requests on the
aligned partition. The logical block size-aligned requests are from
mkfs.xfs and mount.


diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 0ad8442..747c95b 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -229,3 +229,31 @@ Description:
 		when a discarded area is read the discard_zeroes_data
 		parameter will be set to one. Otherwise it will be 0 and
 		the result of reading a discarded area is undefined.
+
+What:		/sys/block/<disk>/alignment_stat
+Date:		August 2012
+Contact:	Martin K. Petersen <martin.petersen@xxxxxxxxxx>
+Description:
+		The block layer keeps track of whether I/O requests are
+		properly aligned multiples of the block sizes found in
+		the queue limits (optimal, minimum, physical and
+		logical). alignment_stat provides a dump of these
+		statistics for a disk. The file contains 10 fields:
+		 1 - reads: total number of completed requests
+		 2 - reads: properly aligned multiples of logical block size
+		 3 - reads: properly aligned multiples of physical block size
+		 4 - reads: properly aligned multiples of minimum I/O size
+		 5 - reads: properly aligned multiples of optimal I/O size
+		 6 - writes: total number of completed requests
+		 7 - writes: properly aligned multiples of logical block size
+		 8 - writes: properly aligned multiples of physical block size
+		 9 - writes: properly aligned multiples of minimum I/O size
+		10 - writes: properly aligned multiples of optimal I/O size
+
+What:		/sys/block/<disk>/<part>/alignment_stat
+Date:		August 2012
+Contact:	Martin K. Petersen <martin.petersen@xxxxxxxxxx>
+Description:
+		The /sys/block/<disk>/<part>/alignment_stat file displays
+		the I/O alignment statistics for partition <part>. The
+		format is the same as /sys/block/<disk>/alignment_stat.
diff --git a/block/blk-core.c b/block/blk-core.c
index 4b4dbdf..fd890d0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1979,13 +1979,30 @@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
 static void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
 	if (blk_do_io_stat(req)) {
+		struct queue_limits *lim = &req->q->limits;
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
-		int cpu;
+		int cpu, alignment;
+		sector_t offset;
 
 		cpu = part_stat_lock();
 		part = req->part;
+		alignment = queue_alignment_offset(req->q);
+		offset = blk_rq_pos(req) << 9;
 		part_stat_add(cpu, part, sectors[rw], bytes >> 9);
+
+		if (lim->io_opt && offset % lim->io_opt == alignment &&
+		    bytes % lim->io_opt == 0)
+			part_stat_inc(cpu, part, opt_aligned_ios[rw]);
+		else if (offset % lim->io_min == alignment &&
+			 bytes % lim->io_min == 0)
+			part_stat_inc(cpu, part, min_aligned_ios[rw]);
+		else if (offset % lim->physical_block_size == alignment &&
+			   bytes % lim->physical_block_size == 0)
+			part_stat_inc(cpu, part, pbs_aligned_ios[rw]);
+		else
+			part_stat_inc(cpu, part, lbs_aligned_ios[rw]);
+
 		part_stat_unlock();
 	}
 }
diff --git a/block/genhd.c b/block/genhd.c
index d839723..7d5819a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -988,6 +988,7 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
 		   NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(alignment_stat, S_IRUGO, part_alignment_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
@@ -1009,6 +1010,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_discard_alignment.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
+	&dev_attr_alignment_stat.attr,
 	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
diff --git a/block/partition-generic.c b/block/partition-generic.c
index f1d1451..35c54af 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -135,6 +135,25 @@ ssize_t part_stat_show(struct device *dev,
 		jiffies_to_msecs(part_stat_read(p, time_in_queue)));
 }
 
+ssize_t part_alignment_stat_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+
+	return sprintf(buf,
+		       "%8lu %8lu %8lu %8lu %8lu %8lu %8lu %8lu %8lu %8lu\n",
+		       part_stat_read(p, ios[READ]),
+		       part_stat_read(p, lbs_aligned_ios[READ]),
+		       part_stat_read(p, pbs_aligned_ios[READ]),
+		       part_stat_read(p, min_aligned_ios[READ]),
+		       part_stat_read(p, opt_aligned_ios[READ]),
+		       part_stat_read(p, ios[WRITE]),
+		       part_stat_read(p, lbs_aligned_ios[WRITE]),
+		       part_stat_read(p, pbs_aligned_ios[WRITE]),
+		       part_stat_read(p, min_aligned_ios[WRITE]),
+		       part_stat_read(p, opt_aligned_ios[WRITE]));
+}
+
 ssize_t part_inflight_show(struct device *dev,
 			struct device_attribute *attr, char *buf)
 {
@@ -175,6 +194,7 @@ static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
 		   NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(alignment_stat, S_IRUGO, part_alignment_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
@@ -189,6 +209,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_discard_alignment.attr,
 	&dev_attr_stat.attr,
+	&dev_attr_alignment_stat.attr,
 	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 4f440b3..9f158be 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -85,6 +85,10 @@ struct disk_stats {
 	unsigned long ticks[2];
 	unsigned long io_ticks;
 	unsigned long time_in_queue;
+	unsigned long lbs_aligned_ios[2];
+	unsigned long pbs_aligned_ios[2];
+	unsigned long min_aligned_ios[2];
+	unsigned long opt_aligned_ios[2];
 };
 
 #define PARTITION_META_INFO_VOLNAMELTH	64
@@ -620,6 +624,8 @@ extern ssize_t part_size_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
 extern ssize_t part_stat_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
+extern ssize_t part_alignment_stat_show(struct device *dev,
+			      struct device_attribute *attr, char *buf);
 extern ssize_t part_inflight_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux