[Resent since I didn't see it show up on the linux-lvm list yet - apologies
if you get this twice]
[weeded out the enormous Cc: list]

On 2004.02.20 15:57, Jens Axboe wrote:
> On Fri, Feb 20 2004, Miquel van Smoorenburg wrote:
> > > > Even if it isn't happening a lot, and something isn't bust it
> > > > might be a good idea to do this.
> > >
> > > Seems OK from a quick check. pdflush will block in get_request_wait()
> > > occasionally, but not at all often. Perhaps we could move the
> > > write_congested test into the mpage_writepages() inner loop but it
> > > hardly seems worth the risk.
> > >
> > > Maybe things are different on Miquel's clockwork controller.
> >
> > I haven't tested it yet because of the "This patch isn't actually so
> > good" comment, but I found another explanation.
> >
> > >  drivers/block/ll_rw_blk.c |    2 ++
> > >  fs/fs-writeback.c         |    2 ++
> > >  2 files changed, 4 insertions(+)
> >
> > *Lightbulb on* .. I just read fs-writeback.c. As I said, this happens
> > with an LVM device. Could it be that because LVM and the actual device
> > have different struct request_queue's, things go awry?
> >
> > In fs-writeback.c, you're looking at the LVM device (and its
> > request_queue, and its backing_dev_info). In __make_request, you're
> > looking at the SCSI device.
>
> In principle, the lvm/md queues themselves will never be congested. But
> the underlying queues can be, of course.
>
> Now this approach is _much_ better, imo. I don't particularly care very
> much for how you solved it, though, I'd much rather just see both
> setting and testing passed down (and kill the ->aux as well).

I just sent a patch to linux-lvm / Joe Thornber that does this. It adds a
void *request_queue to the struct backing_dev_info, and a congestion_fn to
the request_queue_t struct. bdi_{read,write}_congested got moved to
ll_rw_blk.c. dm.c sets the congestion_fn to an internal function that
checks the congestion of all the underlying block devices.
See https://www.redhat.com/archives/linux-lvm/2004-February/msg00203.html
for the actual patch. So that's testing being passed down.

The setting you cannot pass down - you need to pass it "up". I implemented
that too: I added a list of "parent queues" to the request_queue, and a
function so that for example dm can register its queue as "parent" of an
underlying device. It needs to be a list, since different partitions on one
device can be members of different meta-devices.

When the congestion functions in ll_rw_blk.c are called, if a queue is
congested, that info is also passed "up" to all of the "parent" queues.
backing_dev_info uses a counter instead of a bit to mark read/write as
congested. Perhaps it would be better to call a function (congestion_fn).

I think that we only need one of the two - either testing being passed down
or setting being passed up. The first one is the easiest. The second one
keeps the struct backing_dev_info clean (no request_queue pointer). What
would you prefer?

For reference, a first cut at the "passing up queue congestion" is below.

Mike.
queue-parent.patch

--- linux-2.6.3.orig/include/linux/backing-dev.h	2004-02-04 04:43:38.000000000 +0100
+++ linux-2.6.3-queue_parent/include/linux/backing-dev.h	2004-02-23 16:17:53.000000000 +0100
@@ -24,6 +24,8 @@
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
 	unsigned long state;	/* Always use atomic bitops on this */
 	int memory_backed;	/* Cannot clean pages with writepage */
+	int read_congested;	/* Read queue or child read queue congested */
+	int write_congested;	/* Write queue or child write queue congested */
 };
 
 extern struct backing_dev_info default_backing_dev_info;
@@ -34,12 +36,12 @@
 
 static inline int bdi_read_congested(struct backing_dev_info *bdi)
 {
-	return test_bit(BDI_read_congested, &bdi->state);
+	return bdi->read_congested;
 }
 
 static inline int bdi_write_congested(struct backing_dev_info *bdi)
 {
-	return test_bit(BDI_write_congested, &bdi->state);
+	return bdi->write_congested;
 }
 
 #endif	/* _LINUX_BACKING_DEV_H */
--- linux-2.6.3.orig/include/linux/blkdev.h	2004-02-22 13:52:17.000000000 +0100
+++ linux-2.6.3-queue_parent/include/linux/blkdev.h	2004-02-24 15:02:49.000000000 +0100
@@ -267,6 +267,11 @@
 	atomic_t refcnt;	/* map can be shared */
 };
 
+struct rqueue_list {
+	struct list_head rqueue_list_head;
+	struct request_queue *rqueue;
+};
+
 struct request_queue
 {
 	/*
@@ -301,6 +306,8 @@
 
 	struct backing_dev_info	backing_dev_info;
 
+	struct list_head parents;
+
 	/*
 	 * The queue owner gets to use this for whatever they like.
 	 * ll_rw_blk doesn't touch it.
@@ -514,6 +521,8 @@
 extern void __blk_stop_queue(request_queue_t *q);
 extern void blk_run_queue(request_queue_t *q);
 extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *);
+extern int blk_queue_add_parent(request_queue_t *q, request_queue_t *parent);
+extern void blk_queue_remove_parent(request_queue_t *q, request_queue_t *parent);
 
 static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
 {
--- linux-2.6.3.orig/drivers/block/ll_rw_blk.c	2004-02-04 04:43:10.000000000 +0100
+++ linux-2.6.3-queue_parent/drivers/block/ll_rw_blk.c	2004-02-24 16:50:20.000000000 +0100
@@ -93,9 +93,31 @@
 }
 
 /*
- * A queue has just exitted congestion. Note this in the global counter of
- * congested queues, and wake up anyone who was waiting for requests to be
- * put back.
+ * Increment/decrement the congestion counter of this queue
+ * and recurse over the parent queues.
+ */
+static void set_queues_congested(request_queue_t *q, int rw, int how)
+{
+	struct rqueue_list *p;
+	struct list_head *l;
+
+	if (rw == WRITE)
+		q->backing_dev_info.write_congested += how;
+	else
+		q->backing_dev_info.read_congested += how;
+
+	__list_for_each(l, &q->parents) {
+		p = list_entry(l, struct rqueue_list, rqueue_list_head);
+		spin_lock(p->rqueue->queue_lock);
+		set_queues_congested(p->rqueue, rw, how);
+		spin_unlock(p->rqueue->queue_lock);
+	}
+}
+
+/*
+ * A queue has just exited congestion. Flag that in the queue's
+ * VM-visible state flags, and wake up anyone who was waiting
+ * for requests to be put back.
  */
 static void clear_queue_congested(request_queue_t *q, int rw)
 {
@@ -103,23 +125,91 @@
 	wait_queue_head_t *wqh = &congestion_wqh[rw];
 
 	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
-	clear_bit(bit, &q->backing_dev_info.state);
+	if (!test_and_clear_bit(bit, &q->backing_dev_info.state))
+		return;
+	set_queues_congested(q, rw, -1);
 	if (waitqueue_active(wqh))
 		wake_up(wqh);
 }
 
 /*
- * A queue has just entered congestion. Flag that in the queue's VM-visible
- * state flags and increment the global gounter of congested queues.
+ * A queue has just entered congestion. Flag that in the queue's
+ * VM-visible state flags.
  */
 static void set_queue_congested(request_queue_t *q, int rw)
 {
 	enum bdi_state bit;
 
 	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
-	set_bit(bit, &q->backing_dev_info.state);
+	if (test_and_set_bit(bit, &q->backing_dev_info.state))
+		return;
+	set_queues_congested(q, rw, 1);
+}
+
+/**
+ * blk_queue_add_parent - add a queue to the list of parents.
+ * @q: this queue
+ * @parent: parent queue
+ *
+ * Adds a queue to the list of parents. Locks both queues.
+ */
+int blk_queue_add_parent(request_queue_t *q, request_queue_t *parent)
+{
+	struct rqueue_list *r;
+
+	r = kmalloc(sizeof(*r), GFP_KERNEL);
+	if (!r)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&r->rqueue_list_head);
+	r->rqueue = parent;
+
+	spin_lock(q->queue_lock);
+	spin_lock(parent->queue_lock);
+	list_add_tail(&r->rqueue_list_head, &q->parents);
+	parent->backing_dev_info.read_congested +=
+		q->backing_dev_info.read_congested;
+	parent->backing_dev_info.write_congested +=
+		q->backing_dev_info.write_congested;
+	spin_unlock(parent->queue_lock);
+	spin_unlock(q->queue_lock);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(blk_queue_add_parent);
+
+/**
+ * blk_queue_remove_parent - remove a queue from the list of parents.
+ * @q: this queue
+ * @parent: parent queue
+ *
+ * Removes a queue from the list of parents. Locks both queues.
+ */
+void blk_queue_remove_parent(request_queue_t *q, request_queue_t *parent)
+{
+	struct list_head *l;
+	struct rqueue_list *r;
+
+	spin_lock(q->queue_lock);
+	__list_for_each(l, &q->parents) {
+		r = list_entry(l, struct rqueue_list, rqueue_list_head);
+		if (r->rqueue == parent) {
+			spin_lock(parent->queue_lock);
+			parent->backing_dev_info.read_congested -=
+				q->backing_dev_info.read_congested;
+			parent->backing_dev_info.write_congested -=
+				q->backing_dev_info.write_congested;
+			spin_unlock(parent->queue_lock);
+			list_del(l);
+			kfree(r);
+			break;
+		}
+	}
+	spin_unlock(q->queue_lock);
+}
+
+EXPORT_SYMBOL(blk_queue_remove_parent);
+
 /**
  * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
  * @bdev:	device
@@ -1372,6 +1462,7 @@
 	memset(q, 0, sizeof(*q));
 	init_timer(&q->unplug_timer);
 	atomic_set(&q->refcnt, 1);
+	INIT_LIST_HEAD(&q->parents);
 
 	return q;
 }
@@ -2813,8 +2904,11 @@
 queue_requests_store(struct request_queue *q, const char *page, size_t count)
 {
 	struct request_list *rl = &q->rq;
+	int ret;
+
+	spin_lock(q->queue_lock);
 
-	int ret = queue_var_store(&q->nr_requests, page, count);
+	ret = queue_var_store(&q->nr_requests, page, count);
 	if (q->nr_requests < BLKDEV_MIN_RQ)
 		q->nr_requests = BLKDEV_MIN_RQ;
@@ -2841,6 +2935,9 @@
 		blk_clear_queue_full(q, WRITE);
 		wake_up(&rl->wait[WRITE]);
 	}
+
+	spin_unlock(q->queue_lock);
+
 	return ret;
 }
--- linux-2.6.3.orig/drivers/md/dm.h	2004-02-04 04:43:45.000000000 +0100
+++ linux-2.6.3-queue_parent/drivers/md/dm.h	2004-02-23 21:32:40.000000000 +0100
@@ -110,6 +110,8 @@
 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
 struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
+void dm_table_add_parent(struct dm_table *t, struct request_queue *q);
+void dm_table_remove_parent(struct dm_table *t, struct request_queue *q);
 unsigned int dm_table_get_num_targets(struct dm_table *t);
 struct list_head *dm_table_get_devices(struct dm_table *t);
 int dm_table_get_mode(struct dm_table *t);
--- linux-2.6.3.orig/drivers/md/dm-table.c	2004-02-04 04:44:59.000000000 +0100
+++ linux-2.6.3-queue_parent/drivers/md/dm-table.c	2004-02-24 15:17:06.000000000 +0100
@@ -818,6 +818,30 @@
 	q->seg_boundary_mask = t->limits.seg_boundary_mask;
 }
 
+void dm_table_add_parent(struct dm_table *t, struct request_queue *q)
+{
+	struct list_head *d, *devices;
+
+	devices = dm_table_get_devices(t);
+	for (d = devices->next; d != devices; d = d->next) {
+		struct dm_dev *dd = list_entry(d, struct dm_dev, list);
+		request_queue_t *bq = bdev_get_queue(dd->bdev);
+		blk_queue_add_parent(bq, q);
+	}
+}
+
+void dm_table_remove_parent(struct dm_table *t, struct request_queue *q)
+{
+	struct list_head *d, *devices;
+
+	devices = dm_table_get_devices(t);
+	for (d = devices->next; d != devices; d = d->next) {
+		struct dm_dev *dd = list_entry(d, struct dm_dev, list);
+		request_queue_t *bq = bdev_get_queue(dd->bdev);
+		blk_queue_remove_parent(bq, q);
+	}
+}
+
 unsigned int dm_table_get_num_targets(struct dm_table *t)
 {
 	return t->num_targets;
--- linux-2.6.3.orig/drivers/md/dm.c	2004-02-22 13:52:15.000000000 +0100
+++ linux-2.6.3-queue_parent/drivers/md/dm.c	2004-02-24 16:35:32.000000000 +0100
@@ -41,6 +41,7 @@
 struct mapped_device {
 	struct rw_semaphore lock;
 	atomic_t holders;
+	spinlock_t queue_lock;
 
 	unsigned long flags;
 
@@ -600,7 +601,7 @@
 	memset(md, 0, sizeof(*md));
 	init_rwsem(&md->lock);
 	atomic_set(&md->holders, 1);
-
+	md->queue_lock = SPIN_LOCK_UNLOCKED;
 	md->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!md->queue) {
 		kfree(md);
@@ -608,6 +609,7 @@
 	}
 
 	md->queue->queuedata = md;
+	md->queue->queue_lock = &md->queue_lock;
 	blk_queue_make_request(md->queue, dm_request);
 
 	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
@@ -695,6 +697,7 @@
 
 	dm_table_get(t);
 	dm_table_set_restrictions(t, q);
+	dm_table_add_parent(t, q);
 
 	return 0;
 }
@@ -704,6 +707,7 @@
 		return;
 
 	dm_table_event_callback(md->map, NULL, NULL);
+	dm_table_remove_parent(md->map, md->queue);
 	dm_table_put(md->map);
 	md->map = NULL;
 }

_______________________________________________
linux-lvm mailing list
linux-lvm@redhat.com
https://www.redhat.com/mailman/listinfo/linux-lvm
read the LVM HOW-TO at http://tldp.org/HOWTO/LVM-HOWTO/