On 2021/08/27 12:13, Jens Axboe wrote:
> On 8/26/21 8:48 PM, Bart Van Assche wrote:
>> On 8/26/21 5:05 PM, Jens Axboe wrote:
>>> On 8/26/21 6:03 PM, Bart Van Assche wrote:
>>>> Here is an overview of the tests I ran so far, all on the same test
>>>> setup:
>>>> * No I/O scheduler: about 5630 K IOPS.
>>>> * Kernel v5.11 + mq-deadline: about 1100 K IOPS.
>>>> * block-for-next + mq-deadline: about 760 K IOPS.
>>>> * block-for-next with improved mq-deadline performance: about 970 K IOPS.
>>>
>>> So we're still off by about 12%, I don't think that is good enough.
>>> That's assuming that v5.11 + mq-deadline is the same as for-next with
>>> the mq-deadline change reverted? Because that would be the key number
>>> to compare it with.
>>
>> With the patch series that is available at
>> https://github.com/bvanassche/linux/tree/block-for-next the same test
>> reports 1090 K IOPS, or only 1% below the v5.11 result. I will post
>> that series on the linux-block mailing list after I have finished
>> testing it.
>
> OK, sounds good. I do think we should just do the revert at this point;
> any real fix is going to end up being bigger than I'd like right now.
> Then we can re-introduce the feature once we're happy with the results.

FYI, here is what I get with Bart's test script running on a dual-socket
8-core/16-thread Xeon machine (32 CPUs total):

* 5.14.0-rc7, with fb926032b320 reverted:
-----------------------------------------
QD 1: IOPS=305k (*)
QD 2: IOPS=411k
QD 4: IOPS=408k
QD 8: IOPS=414k

* 5.14.0-rc7, current (no modification):
----------------------------------------
QD 1: IOPS=296k (*)
QD 2: IOPS=207k
QD 4: IOPS=208k
QD 8: IOPS=210k

* 5.14.0-rc7, with modified patch (attached to this email):
-----------------------------------------------------------
QD 1: IOPS=287k (*)
QD 2: IOPS=334k
QD 4: IOPS=330k
QD 8: IOPS=334k

For reference, with the same test script using the none scheduler:

QD 1: IOPS=2172k
QD 2: IOPS=1075k
QD 4: IOPS=1075k
QD 8: IOPS=1077k

So the mq-deadline priority patch reduces performance by nearly half at
high queue depth. With the modified patch, we are back to better numbers,
but there is still a significant 20% drop at high queue depth.

(*) Note: in all cases using the mq-deadline scheduler, for the first run
at QD=1, I get the splat below 100% of the time.

[ 95.173889] watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [kworker/0:1H:757]
[ 95.181351] Modules linked in: null_blk rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache netfs nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables libcrc32c nfnetlink sunrpc vfat fat iTCO_wdt iTCO_vendor_support ipmi_ssif x86_pkg_temp_thermal coretemp i2c_i801 acpi_ipmi bfq i2c_smbus ioatdma lpc_ich ipmi_si intel_pch_thermal dca ipmi_devintf ipmi_msghandler acpi_power_meter fuse ip_tables sd_mod ast i2c_algo_bit drm_vram_helper drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm_ttm_helper ttm drm i40e crct10dif_pclmul mpt3sas crc32_pclmul ahci ghash_clmulni_intel libahci raid_class scsi_transport_sas libata pkcs8_key_parser
[ 95.252173] irq event stamp: 30500990
[ 95.255860] hardirqs last enabled at (30500989): [<ffffffff81910e2d>] _raw_spin_unlock_irqrestore+0x2d/0x40
[ 95.265735] hardirqs last disabled at (30500990): [<ffffffff819050cb>] sysvec_apic_timer_interrupt+0xb/0x90
[ 95.275520] softirqs last enabled at (30496338): [<ffffffff810b331f>] __irq_exit_rcu+0xbf/0xe0
[ 95.284259] softirqs last disabled at (30496333): [<ffffffff810b331f>] __irq_exit_rcu+0xbf/0xe0
[ 95.292994] CPU: 0 PID: 757 Comm: kworker/0:1H Not tainted 5.14.0-rc7+ #1334
[ 95.300076] Hardware name: Supermicro Super Server/X11DPL-i, BIOS 3.3 02/21/2020
[ 95.307504] Workqueue: kblockd blk_mq_run_work_fn
[ 95.312243] RIP: 0010:_raw_spin_unlock_irqrestore+0x35/0x40
[ 95.317844] Code: c7 18 53 48 89 f3 48 8b 74 24 10 e8 35 82 80 ff 48 89 ef e8 9d ac 80 ff 80 e7 02 74 06 e8 23 33 8b ff fb 65 ff 0d 8b 5f 70 7e <5b> 5d c3 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 fd 65 ff
[ 95.336680] RSP: 0018:ffff888448cefbb0 EFLAGS: 00000202
[ 95.341934] RAX: 0000000001d1687d RBX: 0000000000000287 RCX: 0000000000000006
[ 95.349103] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff81910e2d
[ 95.356270] RBP: ffff888192649218 R08: 0000000000000001 R09: 0000000000000001
[ 95.363437] R10: 0000000000000000 R11: 000000000000005c R12: 0000000000000000
[ 95.370604] R13: 0000000000000287 R14: ffff888192649218 R15: ffff88885fe68e80
[ 95.377771] FS:  0000000000000000(0000) GS:ffff88885fe00000(0000) knlGS:0000000000000000
[ 95.385901] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 95.391675] CR2: 00007f59bfe71f80 CR3: 000000074a91e005 CR4: 00000000007706f0
[ 95.398842] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 95.406009] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 95.413176] PKRU: 55555554
[ 95.415904] Call Trace:
[ 95.418373]  try_to_wake_up+0x268/0x7c0
[ 95.422238]  blk_update_request+0x25b/0x420
[ 95.426452]  blk_mq_end_request+0x1c/0x120
[ 95.430576]  null_handle_cmd+0x12d/0x270 [null_blk]
[ 95.435485]  blk_mq_dispatch_rq_list+0x13c/0x7f0
[ 95.440130]  ? sbitmap_get+0x86/0x190
[ 95.443826]  __blk_mq_do_dispatch_sched+0xb5/0x2f0
[ 95.448653]  __blk_mq_sched_dispatch_requests+0xf4/0x140
[ 95.453998]  blk_mq_sched_dispatch_requests+0x30/0x60
[ 95.459083]  __blk_mq_run_hw_queue+0x49/0x90
[ 95.463377]  process_one_work+0x26c/0x570
[ 95.467421]  worker_thread+0x55/0x3c0
[ 95.471103]  ? process_one_work+0x570/0x570
[ 95.475313]  kthread+0x140/0x160
[ 95.478567]  ? set_kthread_struct+0x40/0x40
[ 95.482774]  ret_from_fork+0x1f/0x30

-- 
Damien Le Moal
Western Digital Research
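
A note on the dd_queued() overhead that the attached patch addresses: in
the current code, dd_queued() returns "inserted - completed", where both
values are sums of per-CPU counters (the dd_sum() macro in
block/mq-deadline.c), so every call walks all present CPUs while dd->lock
is held. The sketch below only illustrates that pattern; it is not the
kernel's actual code, and the struct and function names are made up for
the example.

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>

/* Illustrative per-CPU event counters, one pair per I/O priority level. */
struct io_stats_sketch {
	u32 inserted;
	u32 completed;
};

/*
 * Summing per-CPU counters walks every present CPU, so this is
 * O(nr_cpus) under dd->lock: the 2.5 us per call quoted in the commit
 * message below was measured on a 128-core board.
 */
static u32 dd_queued_sketch(struct io_stats_sketch __percpu *stats)
{
	unsigned int cpu;
	u32 inserted = 0, completed = 0;

	for_each_present_cpu(cpu) {
		inserted += per_cpu_ptr(stats, cpu)->inserted;
		completed += per_cpu_ptr(stats, cpu)->completed;
	}

	return inserted - completed;	/* requests still in the scheduler */
}

The patch replaces this walk with a single per-priority nr_queued counter
that is only ever modified under dd->lock, turning the O(nr_cpus) sum
into an O(1) read.
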
From 2ac2af2b1316adc934d0e699985567ded595fe26 Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@xxxxxxxxxx>
Date: Thu, 26 Aug 2021 22:40:39 +0800
Subject: [PATCH] block/mq-deadline: Speed up the dispatch of low-priority
 requests

dd_queued() traverses the percpu variable for summation. The more cores,
the higher the performance overhead. I currently have a 128-core board
and this function takes 2.5 us. If the number of high-priority requests
is small and the number of low- and medium-priority requests is large,
the performance impact is significant.

Let's maintain a non-percpu member variable 'nr_queued', which is
incremented by 1 immediately following "inserted++" and decremented by 1
immediately following "completed++". Because both the dd_queued() check
in dd_dispatch_request() and the "inserted++" operation in
dd_insert_request() are protected by dd->lock, lock protection needs to
be added only in dd_finish_request(), which is unlikely to cause
significant performance side effects.

Tested on my 128-core board with two SSD disks.
fio bs=4k rw=read iodepth=128 cpus_allowed=0-95 <others>
Before: [183K/0/0 iops] [172K/0/0 iops]
After:  [258K/0/0 iops] [258K/0/0 iops]

Fixes: fb926032b320 ("block/mq-deadline: Prioritize high-priority requests")
Signed-off-by: Zhen Lei <thunder.leizhen@xxxxxxxxxx>
Signed-off-by: Damien Le Moal <damien.lemoal@xxxxxxx>
---
 block/mq-deadline.c | 81 ++++++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 38 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index a09761cbdf12..0d879f3ff340 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -79,6 +79,7 @@ struct dd_per_prio {
 	struct list_head fifo_list[DD_DIR_COUNT];
 	/* Next request in FIFO order. Read, write or both are NULL. */
 	struct request *next_rq[DD_DIR_COUNT];
+	unsigned int nr_queued;
 };
 
 struct deadline_data {
@@ -106,7 +107,6 @@ struct deadline_data {
 	int aging_expire;
 
 	spinlock_t lock;
-	spinlock_t zone_lock;
 };
 
 /* Count one event of type 'event_type' and with I/O priority 'prio' */
@@ -276,10 +276,12 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 	deadline_remove_request(rq->q, per_prio, rq);
 }
 
-/* Number of requests queued for a given priority level. */
-static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
+/*
+ * Number of requests queued for a given priority level.
+ */
+static inline u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
 {
-	return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
+	return dd->per_prio[prio].nr_queued;
 }
 
 /*
@@ -309,7 +311,6 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 		      enum dd_data_dir data_dir)
 {
 	struct request *rq;
-	unsigned long flags;
 
 	if (list_empty(&per_prio->fifo_list[data_dir]))
 		return NULL;
@@ -322,16 +323,12 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 	 * Look for a write request that can be dispatched, that is one with
 	 * an unlocked target zone.
 	 */
-	spin_lock_irqsave(&dd->zone_lock, flags);
 	list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
 		if (blk_req_can_dispatch_to_zone(rq))
-			goto out;
+			return rq;
 	}
-	rq = NULL;
-out:
-	spin_unlock_irqrestore(&dd->zone_lock, flags);
 
-	return rq;
+	return NULL;
 }
 
 /*
@@ -343,7 +340,6 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 		      enum dd_data_dir data_dir)
 {
 	struct request *rq;
-	unsigned long flags;
 
 	rq = per_prio->next_rq[data_dir];
 	if (!rq)
@@ -356,15 +352,13 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 	 * Look for a write request that can be dispatched, that is one with
 	 * an unlocked target zone.
 	 */
-	spin_lock_irqsave(&dd->zone_lock, flags);
 	while (rq) {
 		if (blk_req_can_dispatch_to_zone(rq))
-			break;
+			return rq;
 		rq = deadline_latter_request(rq);
 	}
-	spin_unlock_irqrestore(&dd->zone_lock, flags);
 
-	return rq;
+	return NULL;
 }
 
 /*
@@ -497,8 +491,10 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	const u64 now_ns = ktime_get_ns();
 	struct request *rq = NULL;
 	enum dd_prio prio;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->lock, flags);
 
-	spin_lock(&dd->lock);
 	/*
 	 * Start with dispatching requests whose deadline expired more than
 	 * aging_expire jiffies ago.
@@ -520,7 +516,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	}
 
 unlock:
-	spin_unlock(&dd->lock);
+	spin_unlock_irqrestore(&dd->lock, flags);
 
 	return rq;
 }
@@ -622,7 +618,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 	dd->fifo_batch = fifo_batch;
 	dd->aging_expire = aging_expire;
 	spin_lock_init(&dd->lock);
-	spin_lock_init(&dd->zone_lock);
 
 	q->elevator = eq;
 	return 0;
@@ -675,10 +670,11 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
 	struct deadline_data *dd = q->elevator->elevator_data;
 	struct request *free = NULL;
 	bool ret;
+	unsigned long flags;
 
-	spin_lock(&dd->lock);
+	spin_lock_irqsave(&dd->lock, flags);
 	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
-	spin_unlock(&dd->lock);
+	spin_unlock_irqrestore(&dd->lock, flags);
 
 	if (free)
 		blk_mq_free_request(free);
@@ -690,7 +686,7 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
  * add rq to rbtree and fifo
  */
 static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
-			      bool at_head)
+			      bool at_head, struct list_head *free)
 {
 	struct request_queue *q = hctx->queue;
 	struct deadline_data *dd = q->elevator->elevator_data;
@@ -699,7 +695,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
 	struct dd_per_prio *per_prio;
 	enum dd_prio prio;
-	LIST_HEAD(free);
 
 	lockdep_assert_held(&dd->lock);
 
@@ -712,14 +707,14 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	prio = ioprio_class_to_prio[ioprio_class];
 	dd_count(dd, inserted, prio);
 
-	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
-		blk_mq_free_requests(&free);
+	if (blk_mq_sched_try_insert_merge(q, rq, free))
 		return;
-	}
+
+	per_prio = &dd->per_prio[prio];
+	per_prio->nr_queued++;
 
 	trace_block_rq_insert(rq);
 
-	per_prio = &dd->per_prio[prio];
 	if (at_head) {
 		list_add(&rq->queuelist, &per_prio->dispatch);
 	} else {
@@ -747,16 +742,23 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
 {
 	struct request_queue *q = hctx->queue;
 	struct deadline_data *dd = q->elevator->elevator_data;
+	unsigned long flags;
+	LIST_HEAD(free);
 
-	spin_lock(&dd->lock);
+	spin_lock_irqsave(&dd->lock, flags);
 	while (!list_empty(list)) {
 		struct request *rq;
 
 		rq = list_first_entry(list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
-		dd_insert_request(hctx, rq, at_head);
+		dd_insert_request(hctx, rq, at_head, &free);
+		if (!list_empty(&free)) {
+			spin_unlock_irqrestore(&dd->lock, flags);
+			blk_mq_free_requests(&free);
+			spin_lock_irqsave(&dd->lock, flags);
+		}
 	}
-	spin_unlock(&dd->lock);
+	spin_unlock_irqrestore(&dd->lock, flags);
 }
 
 /*
@@ -790,18 +792,21 @@ static void dd_finish_request(struct request *rq)
 	const u8 ioprio_class = dd_rq_ioclass(rq);
 	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
+	unsigned long flags;
 
 	dd_count(dd, completed, prio);
 
-	if (blk_queue_is_zoned(q)) {
-		unsigned long flags;
+	spin_lock_irqsave(&dd->lock, flags);
 
-		spin_lock_irqsave(&dd->zone_lock, flags);
+	per_prio->nr_queued--;
+
+	if (blk_queue_is_zoned(q)) {
 		blk_req_zone_write_unlock(rq);
 		if (!list_empty(&per_prio->fifo_list[DD_WRITE]))
 			blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
-		spin_unlock_irqrestore(&dd->zone_lock, flags);
 	}
+
+	spin_unlock_irqrestore(&dd->lock, flags);
 }
 
 static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
@@ -899,7 +904,7 @@ static void *deadline_##name##_fifo_start(struct seq_file *m,	\
 	struct deadline_data *dd = q->elevator->elevator_data;		\
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
 									\
-	spin_lock(&dd->lock);						\
+	spin_lock_irq(&dd->lock);					\
 	return seq_list_start(&per_prio->fifo_list[data_dir], *pos);	\
 }									\
 									\
@@ -919,7 +924,7 @@ static void deadline_##name##_fifo_stop(struct seq_file *m, void *v)	\
 	struct request_queue *q = m->private;				\
 	struct deadline_data *dd = q->elevator->elevator_data;		\
 									\
-	spin_unlock(&dd->lock);						\
+	spin_unlock_irq(&dd->lock);					\
 }									\
 									\
 static const struct seq_operations deadline_##name##_fifo_seq_ops = {	\
@@ -1015,7 +1020,7 @@ static void *deadline_dispatch##prio##_start(struct seq_file *m,	\
 	struct deadline_data *dd = q->elevator->elevator_data;		\
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
 									\
-	spin_lock(&dd->lock);						\
+	spin_lock_irq(&dd->lock);					\
 	return seq_list_start(&per_prio->dispatch, *pos);		\
 }									\
 									\
@@ -1035,7 +1040,7 @@ static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v)	\
 	struct request_queue *q = m->private;				\
 	struct deadline_data *dd = q->elevator->elevator_data;		\
 									\
-	spin_unlock(&dd->lock);						\
+	spin_unlock_irq(&dd->lock);					\
 }									\
 									\
 static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \
-- 
2.31.1
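
A closing note on the locking (my reading of the patch, not stated in its
description): dd_finish_request() runs in the request completion path,
which can execute in hard interrupt context. Once that path takes
dd->lock, any holder of dd->lock that can be interrupted on the same CPU
could self-deadlock, so every acquisition has to disable interrupts,
hence the wholesale conversion to spin_lock_irqsave()/spin_lock_irq(),
and hence also why the separate zone_lock can be folded into dd->lock. A
minimal sketch of the pattern, with stand-in names rather than the
scheduler's own:

#include <linux/types.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);	/* stands in for dd->lock */
static u32 demo_nr_queued;		/* stands in for per_prio->nr_queued */

/*
 * Process context, e.g. the insert path: interrupts must stay off while
 * the lock is held, or a completion IRQ delivered to this CPU could spin
 * on demo_lock forever.
 */
static void demo_insert(void)
{
	unsigned long flags;

	spin_lock_irqsave(&demo_lock, flags);
	demo_nr_queued++;
	spin_unlock_irqrestore(&demo_lock, flags);
}

/*
 * Completion path, e.g. dd_finish_request(), possibly in hard IRQ
 * context: it takes the same lock, which is why the counter needs no
 * percpu or atomic machinery.
 */
static void demo_complete(void)
{
	unsigned long flags;

	spin_lock_irqsave(&demo_lock, flags);
	demo_nr_queued--;
	spin_unlock_irqrestore(&demo_lock, flags);
}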