On 12/20/18 11:21 AM, Jens Axboe wrote: > On 12/20/18 11:01 AM, Bart Van Assche wrote: >> On Thu, 2018-12-20 at 06:07 -0700, Jens Axboe wrote: >>> On 12/20/18 6:02 AM, Jens Axboe wrote: >>>>> I'm afraid this cannot work. >>>>> >>>>> The 'tags' here could be the hctx->sched_tags, but what we need to >>>>> clear is hctx->tags->rqs[]. >>>> >>>> You are right, of course, a bit too quick on the trigger. This one >>>> should work better, and also avoids that silly quadratic loop. I don't >>>> think we need the tag == -1 check, but probably best to be safe. >>> >>> Sent out the wrong one, here it is. Bart, if you can reproduce, can you >>> give it a whirl? >>> >>> >>> diff --git a/block/blk-mq.c b/block/blk-mq.c >>> index 2de972857496..fc04bb648f18 100644 >>> --- a/block/blk-mq.c >>> +++ b/block/blk-mq.c >>> @@ -2025,7 +2025,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, >>> { >>> struct page *page; >>> >>> - if (tags->rqs && set->ops->exit_request) { >>> + if (tags->rqs) { >>> int i; >>> >>> for (i = 0; i < tags->nr_tags; i++) { >>> @@ -2033,8 +2033,14 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, >>> >>> if (!rq) >>> continue; >>> - set->ops->exit_request(set, rq, hctx_idx); >>> + if (set->ops->exit_request) >>> + set->ops->exit_request(set, rq, hctx_idx); >>> tags->static_rqs[i] = NULL; >>> + >>> + if (rq->tag == -1) >>> + continue; >>> + if (set->tags[hctx_idx]->rqs[rq->tag] == rq) >>> + set->tags[hctx_idx]->rqs[rq->tag] = NULL; >>> } >>> } >>> >>> @@ -2113,6 +2119,7 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, >>> return ret; >>> } >>> >>> + rq->tag = -1; >>> WRITE_ONCE(rq->state, MQ_RQ_IDLE); >>> return 0; >>> } >> >> Hi Jens, >> >> Are you sure this is sufficient? > > No, I don't think it is. > >> My understanding is that >> blk_mq_queue_tag_busy_iter() iterates over all tags in the tag set. So if the >> request queue on which part_in_flight() is called and the request queue for >> which blk_mq_free_rqs() is called share their tag set then part_in_flight() >> and blk_mq_free_rqs() can run concurrently. That can cause ugly race >> conditions. Do you think it would be a good idea to modify the inflight >> accounting code such that it only considers the requests of a single request >> queue instead of all requests for a given tag set? > > That would of course solve it, the question is how to do it. One option > would be to have ->rqs[] be: > > struct rq_entry { > struct request_queue *q; > struct request *rq; > }; > > instead of just a request, since then you could check the queue without > having to dereference the request. The current race is inherent in that > we set ->rqs[] AFTER having acquired the tag, so there's a window where > you could find a stale entry. That's not normally an issue since > requests are persistent, but for shared tag maps and queues disappearing > it can pose a problem. Something like this, totally untested. If the queue matches, we know it's safe to dereference it. A safer API to export as well... diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 2089c6c62f44..78192b544fa2 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -228,13 +228,15 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) if (!reserved) bitnr += tags->nr_reserved_tags; - rq = tags->rqs[bitnr]; + if (tags->rqs[bitnr].queue != hctx->queue) + return true; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ - if (rq && rq->q == hctx->queue) + rq = tags->rqs[bitnr].rq; + if (rq) return iter_data->fn(hctx, rq, iter_data->data, reserved); return true; } @@ -268,6 +270,7 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, struct bt_tags_iter_data { struct blk_mq_tags *tags; + struct blk_mq_hw_ctx *hctx; busy_tag_iter_fn *fn; void *data; bool reserved; @@ -287,7 +290,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * We can hit rq == NULL here, because the tagging functions * test and set the bit before assining ->rqs[]. */ - rq = tags->rqs[bitnr]; + rq = tags->rqs[bitnr].rq; if (rq && blk_mq_request_started(rq)) return iter_data->fn(rq, iter_data->data, reserved); diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 61deab0b5a5a..bb84d1f099c7 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -4,6 +4,11 @@ #include "blk-mq.h" +struct rq_tag_entry { + struct request_queue *queue; + struct request *rq; +}; + /* * Tag address space map. */ @@ -16,7 +21,7 @@ struct blk_mq_tags { struct sbitmap_queue bitmap_tags; struct sbitmap_queue breserved_tags; - struct request **rqs; + struct rq_tag_entry *rqs; struct request **static_rqs; struct list_head page_list; }; @@ -78,7 +83,8 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag, struct request *rq) { - hctx->tags->rqs[tag] = rq; + hctx->tags->rqs[tag].queue = hctx->queue; + hctx->tags->rqs[tag].rq = rq; } static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, diff --git a/block/blk-mq.c b/block/blk-mq.c index 3ba37b9e15e9..4f194946dbd9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -298,13 +298,16 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->tag = -1; rq->internal_tag = tag; } else { - if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct blk_mq_hw_ctx *hctx = data->hctx; + + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { rq_flags = RQF_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); + atomic_inc(&hctx->nr_active); } rq->tag = tag; rq->internal_tag = -1; - data->hctx->tags->rqs[rq->tag] = rq; + hctx->tags->rqs[rq->tag].queue = hctx->queue; + hctx->tags->rqs[rq->tag].rq = rq; } /* csd/requeue_work/fifo_time is initialized before use */ @@ -797,8 +800,8 @@ EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { if (tag < tags->nr_tags) { - prefetch(tags->rqs[tag]); - return tags->rqs[tag]; + prefetch(tags->rqs[tag].rq); + return tags->rqs[tag].rq; } return NULL; @@ -809,10 +812,11 @@ static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { /* - * If we find a request that is inflight and the queue matches, - * we know the queue is busy. Return false to stop the iteration. + * We're only called here if the queue matches. If the rq state is + * inflight, we know the queue is busy. Return false to stop the + * iteration. */ - if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) { + if (rq->state == MQ_RQ_IN_FLIGHT) { bool *busy = priv; *busy = true; @@ -1049,11 +1053,14 @@ bool blk_mq_get_driver_tag(struct request *rq) shared = blk_mq_tag_busy(data.hctx); rq->tag = blk_mq_get_tag(&data); if (rq->tag >= 0) { + struct blk_mq_hw_ctx *hctx = data.hctx; + if (shared) { rq->rq_flags |= RQF_MQ_INFLIGHT; atomic_inc(&data.hctx->nr_active); } - data.hctx->tags->rqs[rq->tag] = rq; + hctx->tags->rqs[rq->tag].queue = hctx->queue; + hctx->tags->rqs[rq->tag].rq = rq; } done: @@ -2069,7 +2076,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, if (!tags) return NULL; - tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), + tags->rqs = kcalloc_node(nr_tags, sizeof(struct rq_tag_entry), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->rqs) { -- Jens Axboe