Hi, Shaohua

I found that it indeed doesn't do a front merge when two threads flush their plug lists concurrently. To reproduce this, I prepared two IO threads, named a0.io and a1.io.

Thread a1.io uses libaio to write 5 requests, sectors: 16 + 8, 40 + 8, 64 + 8, 88 + 8, 112 + 8
Thread a0.io uses libaio to write the other 5 requests, sectors: 8 + 8, 32 + 8, 56 + 8, 80 + 8, 104 + 8

To meet the condition that thread a1.io flushes all of its requests to the queue before thread a0.io does, I delay thread a1.io before it runs the queue, after it has added all of its plugged requests, as in the patch below:

--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3324,6 +3324,21 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		/*
 		 * This drops the queue lock
 		 */
+
+		if (strcmp(current->comm, "a1.io") == 0)
+		{
+			spin_unlock_irq(q->queue_lock);
+			mdelay(2000);
+			spin_lock_irq(q->queue_lock);
+		}
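For reference, each thread is a small libaio program along these lines. This is a minimal sketch of a1.io, not the exact test code: the target /dev/sdb is assumed from the 8,16 device number in the traces, error handling is omitted, and a0.io is identical except for the sector list { 8, 32, 56, 80, 104 }.

#define _GNU_SOURCE
#include <libaio.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	static const long long sectors[] = { 16, 40, 64, 88, 112 };
	struct iocb cbs[5], *cbp[5];
	struct io_event events[5];
	io_context_t ctx = 0;
	void *buf;
	int fd, i;

	fd = open("/dev/sdb", O_WRONLY | O_DIRECT);	/* assumed device under test */
	posix_memalign(&buf, 4096, 4096);		/* O_DIRECT wants an aligned buffer */
	memset(buf, 0xaa, 4096);

	io_setup(5, &ctx);
	for (i = 0; i < 5; i++) {
		/* one 8-sector (4KB) write at byte offset sector * 512 */
		io_prep_pwrite(&cbs[i], fd, buf, 4096, sectors[i] * 512);
		cbp[i] = &cbs[i];
	}
	io_submit(ctx, 5, cbp);			/* submit all five writes in one call */
	io_getevents(ctx, 5, 5, events, NULL);	/* wait for all completions */

	io_destroy(ctx);
	close(fd);
	return 0;
}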
Then I used "./a1.io & sleep 1 && ./a0.io" to dispatch the IO and used blktrace to trace the processing; the result is shown below:

8,16 1 174 176.733880160 1217 Q WS 16 + 8 [a1.io]
8,16 1 175 176.733885300 1217 G WS 16 + 8 [a1.io]
8,16 1 177 176.733890820 1217 Q WS 40 + 8 [a1.io]
8,16 1 178 176.733892460 1217 G WS 40 + 8 [a1.io]
8,16 1 179 176.733896380 1217 Q WS 64 + 8 [a1.io]
8,16 1 180 176.733897900 1217 G WS 64 + 8 [a1.io]
8,16 1 181 176.733901260 1217 Q WS 88 + 8 [a1.io]
8,16 1 182 176.733902760 1217 G WS 88 + 8 [a1.io]
8,16 1 183 176.733906200 1217 Q WS 112 + 8 [a1.io]
8,16 1 184 176.733907640 1217 G WS 112 + 8 [a1.io]
8,16 1 185 176.733909560 1217 I WS 16 + 8 [a1.io]
8,16 1 186 176.733925480 1217 I WS 40 + 8 [a1.io]
8,16 1 187 176.733936560 1217 I WS 64 + 8 [a1.io]
8,16 1 188 176.733945920 1217 I WS 88 + 8 [a1.io]
8,16 1 189 176.733953880 1217 I WS 112 + 8 [a1.io]
8,16 2 103 176.734317080 1218 Q WS 8 + 8 [a0.io]
8,16 2 104 176.734320600 1218 G WS 8 + 8 [a0.io]
8,16 2 106 176.734325520 1218 Q WS 32 + 8 [a0.io]
8,16 2 107 176.734327140 1218 G WS 32 + 8 [a0.io]
8,16 2 108 176.734330620 1218 Q WS 56 + 8 [a0.io]
8,16 2 109 176.734332140 1218 G WS 56 + 8 [a0.io]
8,16 2 110 176.734335380 1218 Q WS 80 + 8 [a0.io]
8,16 2 111 176.734336760 1218 G WS 80 + 8 [a0.io]
8,16 2 112 176.734340300 1218 Q WS 104 + 8 [a0.io]
8,16 2 113 176.734341900 1218 G WS 104 + 8 [a0.io]
8,16 2 114 176.734343520 1218 I WS 8 + 8 [a0.io]
8,16 2 115 176.734357280 1218 I WS 32 + 8 [a0.io]
8,16 2 116 176.734366560 1218 I WS 56 + 8 [a0.io]
8,16 2 117 176.734374800 1218 I WS 80 + 8 [a0.io]
8,16 2 118 176.734382460 1218 I WS 104 + 8 [a0.io]
8,16 2 120 176.738012960 1218 D WS 112 + 8 [a0.io]
8,16 0 151 176.740923200 3 C WS 112 + 8 [0]
8,16 2 121 176.741804960 1218 D WS 88 + 8 [a0.io]
8,16 2 122 176.741826060 1218 R WS 88 + 8 [0]
8,16 2 123 176.741827740 1218 I WS 88 + 8 [a0.io]
8,16 0 152 176.745259080 3 D WS 88 + 8 [ksoftirqd/0]
8,16 0 153 176.747824380 3 D WS 64 + 8 [ksoftirqd/0]
8,16 0 154 176.748921220 3 C WS 88 + 8 [0]
8,16 0 155 176.751472640 3 D WS 40 + 8 [ksoftirqd/0]
8,16 0 156 176.753172500 3 C WS 64 + 8 [0]
8,16 0 157 176.755720740 3 D WS 16 + 8 [ksoftirqd/0]
8,16 0 158 176.757920780 3 C WS 40 + 8 [0]
8,16 0 159 176.760471320 3 D WS 104 + 8 [ksoftirqd/0]
8,16 0 160 176.763172180 3 C WS 16 + 8 [0]
8,16 0 161 176.765717560 3 D WS 80 + 8 [ksoftirqd/0]
8,16 0 162 176.766795400 3 C WS 104 + 8 [0]
8,16 0 163 176.769341020 3 D WS 56 + 8 [ksoftirqd/0]
8,16 2 124 176.770926380 18 C WS 80 + 8 [0]
8,16 0 164 176.774335640 924 C WS 56 + 8 [0]
8,16 2 125 176.774642140 18 D WS 32 + 8 [ksoftirqd/2]
8,16 2 126 176.774651300 18 R WS 32 + 8 [0]
8,16 2 127 176.774652900 18 I WS 32 + 8 [ksoftirqd/2]
8,16 0 165 176.778546360 924 D WS 32 + 8 [sshd]
8,16 0 166 176.782498120 924 D WS 8 + 8 [sshd]
8,16 0 167 176.785298980 3 C WS 32 + 8 [0]
8,16 0 168 176.788671240 3 C WS 8 + 8 [0]

We can see from the result that the adjacent requests did not get merged: every dispatch (D) is still 8 sectors. To make those adjacent requests merge, I tried writing a simple patch:

diff --git a/block/elevator.c b/block/elevator.c
index c3555c9c6..40acd0e84 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -245,6 +245,7 @@ EXPORT_SYMBOL(elevator_exit);
 static inline void __elv_rqhash_del(struct request *rq)
 {
 	hash_del(&rq->hash);
+	hash_del(&rq->front_hash);
 	rq->cmd_flags &= ~REQ_HASHED;
 }
 
@@ -252,6 +253,7 @@ static void elv_rqhash_del(struct request_queue *q, struct request *rq)
 {
 	if (ELV_ON_HASH(rq))
 		__elv_rqhash_del(rq);
+
 }
 
 static void elv_rqhash_add(struct request_queue *q, struct request *rq)
@@ -260,6 +262,7 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq)
 
 	BUG_ON(ELV_ON_HASH(rq));
 	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
+	hash_add(e->front_hash, &rq->front_hash, blk_rq_pos(rq));
 	rq->cmd_flags |= REQ_HASHED;
 }
 
@@ -290,6 +293,28 @@ static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
 	return NULL;
 }
 
+static struct request *elv_fronthash_find(struct request_queue *q, sector_t offset)
+{
+	struct elevator_queue *e = q->elevator;
+	struct hlist_node *next;
+	struct request *rq;
+
+	hash_for_each_possible_safe(e->front_hash, rq, next, front_hash, offset) {
+		BUG_ON(!ELV_ON_HASH(rq));
+
+		if (unlikely(!rq_mergeable(rq))) {
+			__elv_rqhash_del(rq);
+			continue;
+		}
+
+		if (blk_rq_pos(rq) == offset)
+			return rq;
+	}
+
+	return NULL;
+}
+
+
 /*
  * RB-tree support functions for inserting/lookup/removal of requests
  * in a sorted RB tree.
@@ -493,6 +518,39 @@ static bool elv_attempt_insert_merge(struct request_queue *q,
 	return ret;
 }
 
+/*
+ * Attempt to do an insertion front merge.
+ * Returns true if we merged, false otherwise
+ */
+static bool elv_attempt_front_merge(struct request_queue *q,
+				    struct request *rq)
+{
+	struct request *__rq;
+	bool ret;
+
+	if (blk_queue_nomerges(q))
+		return false;
+
+	/*
+	 * First try one-hit cache.
+	 */
+	if (q->last_merge && blk_attempt_req_merge(q, rq, q->last_merge))
+		return true;
+
+	if (blk_queue_noxmerges(q))
+		return false;
+
+	ret = false;
+	/*
+	 * See if our hash lookup can find a potential frontmerge.
+	 */
+
+	__rq = elv_fronthash_find(q, rq_hash_key(rq));
+	if (__rq && blk_attempt_req_merge(q, rq, __rq)) {
+		ret = true;
+	}
+	return ret;
+}
 
 void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
@@ -657,6 +715,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 		 * elevator_add_req_fn.
 		 */
 		q->elevator->type->ops.elevator_add_req_fn(q, rq);
+		/* do a front merge after we added it to elevator */
+		elv_attempt_front_merge(q, rq);
 		break;
 
 	case ELEVATOR_INSERT_FLUSH:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 085a03f67..9fe25be1b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -120,6 +120,8 @@ struct request {
 		struct list_head ipi_list;
 	};
 
+	struct hlist_node front_hash;	/* front merge hash */
+
 	/*
 	 * The rb_node is only used inside the io scheduler, requests
 	 * are pruned when moved to the dispatch queue.
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 638b324f0..1567e5412 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -114,6 +114,7 @@ struct elevator_queue
 	struct mutex sysfs_lock;
 	unsigned int registered:1;
 	DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
+	DECLARE_HASHTABLE(front_hash, ELV_HASH_BITS);
 };
 
 /*
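The idea behind the patch, as a toy user-space model (the names toy_rq, rq_end, rq_start are made up for illustration; this is not kernel code): the existing elevator hash e->hash is keyed by a request's end sector (rq_hash_key()), so it can only answer "which queued request ends where the incoming one starts", i.e. back merges. The new e->front_hash is keyed by the start sector (blk_rq_pos()), so the incoming request can look up its own end sector and find the queued request it should merge in front of. With the sectors from the test:

#include <stdio.h>

struct toy_rq { unsigned long long pos, sectors; };	/* toy stand-in for struct request */

/* key used in e->hash: the request's end sector (rq_hash_key in the kernel) */
static unsigned long long rq_end(struct toy_rq *rq) { return rq->pos + rq->sectors; }
/* key used in e->front_hash: the request's start sector (blk_rq_pos) */
static unsigned long long rq_start(struct toy_rq *rq) { return rq->pos; }

int main(void)
{
	struct toy_rq queued   = { 40, 8 };	/* from a1.io, already in the queue */
	struct toy_rq incoming = { 32, 8 };	/* from a0.io, being flushed now */

	/* back merge lookup: does a queued request end where 'incoming' starts? */
	if (rq_end(&queued) == rq_start(&incoming))	/* 48 != 32: no back merge */
		printf("back merge\n");

	/* front merge lookup: does a queued request start where 'incoming' ends? */
	if (rq_start(&queued) == rq_end(&incoming))	/* 40 == 40: merge */
		printf("front merge -> %llu + %llu\n", incoming.pos,
		       incoming.sectors + queued.sectors);	/* prints "32 + 16" */
	return 0;
}

This predicts exactly the 16-sector dispatches (8 + 16, 32 + 16, ...) visible in the trace below.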
With the patch applied, I reran "./a1.io & sleep 1 && ./a0.io", and the blktrace result is shown below:

8,16 2 249 471.608930240 1473 Q WS 16 + 8 [a1.io]
8,16 2 250 471.608934940 1473 G WS 16 + 8 [a1.io]
8,16 2 252 471.608940680 1473 Q WS 40 + 8 [a1.io]
8,16 2 253 471.608942840 1473 G WS 40 + 8 [a1.io]
8,16 2 254 471.608946500 1473 Q WS 64 + 8 [a1.io]
8,16 2 255 471.608948040 1473 G WS 64 + 8 [a1.io]
8,16 2 256 471.608951460 1473 Q WS 88 + 8 [a1.io]
8,16 2 257 471.608952960 1473 G WS 88 + 8 [a1.io]
8,16 2 258 471.608956340 1473 Q WS 112 + 8 [a1.io]
8,16 2 259 471.608957780 1473 G WS 112 + 8 [a1.io]
8,16 2 260 471.608959880 1473 I WS 16 + 8 [a1.io]
8,16 2 261 471.608975880 1473 I WS 40 + 8 [a1.io]
8,16 2 262 471.608986420 1473 I WS 64 + 8 [a1.io]
8,16 2 263 471.608995480 1473 I WS 88 + 8 [a1.io]
8,16 2 264 471.609003720 1473 I WS 112 + 8 [a1.io]
8,16 3 267 471.610200680 1474 Q WS 8 + 8 [a0.io]
8,16 3 268 471.610204700 1474 G WS 8 + 8 [a0.io]
8,16 3 270 471.610210280 1474 Q WS 32 + 8 [a0.io]
8,16 3 271 471.610211840 1474 G WS 32 + 8 [a0.io]
8,16 3 272 471.610215660 1474 Q WS 56 + 8 [a0.io]
8,16 3 273 471.610217200 1474 G WS 56 + 8 [a0.io]
8,16 3 274 471.610220860 1474 Q WS 80 + 8 [a0.io]
8,16 3 275 471.610222300 1474 G WS 80 + 8 [a0.io]
8,16 3 276 471.610225720 1474 Q WS 104 + 8 [a0.io]
8,16 3 277 471.610227540 1474 G WS 104 + 8 [a0.io]
8,16 3 278 471.610229100 1474 I WS 8 + 8 [a0.io]
8,16 3 279 471.615291420 1474 I WS 32 + 8 [a0.io]
8,16 3 280 471.620311200 1474 I WS 56 + 8 [a0.io]
8,16 3 281 471.625327620 1474 I WS 80 + 8 [a0.io]
8,16 3 282 471.630343080 1474 I WS 104 + 8 [a0.io]
8,16 3 284 471.637881600 1474 D WS 104 + 16 [a0.io]
8,16 3 285 471.640429120 1474 D WS 80 + 16 [a0.io]
8,16 0 397 471.644573100 3 C WS 104 + 16 [0]
8,16 0 398 471.647159640 3 D WS 56 + 16 [ksoftirqd/0]
8,16 1 49 471.649825940 13 C WS 80 + 16 [0]
8,16 1 50 471.652391540 13 D WS 32 + 16 [ksoftirqd/1]
8,16 0 399 471.654446580 3 C WS 56 + 16 [0]
8,16 0 400 471.657000820 3 D WS 8 + 16 [ksoftirqd/0]
8,16 0 401 471.659445000 3 C WS 32 + 16 [0]
8,16 0 402 471.663946360 3 C WS 8 + 16 [0]

Now the adjacent requests do get merged: every dispatch (D) is 16 sectors, e.g. 104 + 16 covers both 104 + 8 from a0.io and 112 + 8 from a1.io.
------------------ Original ------------------
From: "Zhengyuan Liu" <liuzhengyuan@xxxxxxxxxx>
Date: Fri, Apr 13, 2018 04:16 PM
To: "shli" <shli@xxxxxx>
Subject: question about request merge

Hi, Shaohua

A question came up while I was reading the block layer code, so I'm writing to you for help.

When a request from the plug list gets flushed to the queue, the kernel tries to merge it into an existing request by calling elv_attempt_insert_merge() before adding it directly to the queue; see below:

/*
 * Attempt to do an insertion back merge. Only check for the case where
 * we can append 'rq' to an existing request, so we can throw 'rq' away
 * afterwards.
 *
 * Returns true if we merged, false otherwise
 */
bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
{
	struct request *__rq;
	bool ret;

	if (blk_queue_nomerges(q))
		return false;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
		return true;

	if (blk_queue_noxmerges(q))
		return false;

	ret = false;
	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	while (1) {
		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
			break;

		/* The merged request could be merged with others, try again */
		ret = true;
		rq = __rq;
	}

	return ret;
}

From the comment and the code we can see that it only does a back merge. I think that is enough when only one thread is doing IO, since the requests in the plug list are sorted before unplugging; see the patches from Jianpeng Ma, 422765c ("block: Remove should_sort judgement when flush blk_plug") and 975927b ("block: Add blk_rq_pos(rq) to sort rq when plushing").

However, when it comes to multiple threads, imagine the IO scenario below, where threads A and B each hold three requests in their plug lists:

threadA: a+1, a+4, a+7
threadB: a+2, a+5, a+8

If threadA flushes all of its requests to the queue before threadB does, then requests a+1 and a+2, and the other adjacent pairs, have a chance to get merged through a back merge. But if threadB flushes all of its requests to the queue before threadA does, how could those adjacent requests get merged? Or does it not matter at all?

I know your patch bee0393 ("block: recursive merge requests") solved a multi-thread merge problem, but not this one.

It is hard to simulate such an IO scenario from user space, and I can't figure out any other useful method to verify whether the requests in such a scenario really can't get merged. All my thoughts come from speculation, so I hope you can give me some answers. Any reply would be appreciated.

Zhengyuan