On 9/21/2016 9:26 PM, Akinobu Mita wrote:
> 2016-09-21 23:06 GMT+09:00 Khan, Imran <kimran@xxxxxxxxxxxxxx>:
>
>>>> commit 084ee793ec1ff4e2171b481642bfbdaa2e10664c
>>>> Author: Imran Khan <kimran@xxxxxxxxxxxxxx>
>>>> Date:   Thu Sep 15 16:44:02 2016 +0530
>>>>
>>>> blk-mq: use static mapping
>>>>
>>>> The blk-mq layer performs a remapping between s/w and h/w contexts, and
>>>> between h/w contexts and CPUs, whenever a CPU hotplug event happens.
>>>> This remapping has to wait for queue freezing, which may take tens of
>>>> milliseconds, resulting in high latency in the CPU hotplug path.
>>>> This patch makes the above-mentioned mappings static, so that remapping
>>>> is avoided when a CPU hotplug event happens, which results in improved
>>>> CPU hotplug latency.
>>>>
>>>> Change-Id: Idf38cb6c4e78c91fda3c86608c6d0441f01ab435
>>>> Signed-off-by: Imran Khan <kimran@xxxxxxxxxxxxxx>
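
The core idea: once the map is computed over all possible CPUs (nr_cpu_ids,
fixed at boot) rather than the online mask, a CPU's queue assignment never
changes across hotplug. A simplified sketch of the mapping, ignoring the
thread-sibling handling in the real code (cpu_to_queue_index mirrors the
helper in block/blk-mq-cpumap.c; map_queues_static is a made-up name for
illustration only):

	static unsigned int cpu_to_queue_index(unsigned int nr_cpus,
					       unsigned int nr_queues,
					       const unsigned int cpu)
	{
		/* spread CPUs evenly across the available hardware queues */
		return cpu * nr_queues / nr_cpus;
	}

	static void map_queues_static(unsigned int *map, unsigned int nr_queues)
	{
		unsigned int cpu;

		/*
		 * Every possible CPU gets a fixed queue, online or not, so a
		 * hotplug event never forces a remap (and hence no queue
		 * freeze); only the hctx cpumasks need touching up.
		 */
		for (cpu = 0; cpu < nr_cpu_ids; cpu++)
			map[cpu] = cpu_to_queue_index(nr_cpu_ids, nr_queues, cpu);
	}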
>>>>
>>>> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
>>>> index 8764c24..85fdb88 100644
>>>> --- a/block/blk-mq-cpumap.c
>>>> +++ b/block/blk-mq-cpumap.c
>>>> @@ -52,18 +52,13 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
>>>>
>>>>  	queue = 0;
>>>>  	for_each_possible_cpu(i) {
>>>> -		if (!cpumask_test_cpu(i, online_mask)) {
>>>> -			map[i] = 0;
>>>> -			continue;
>>>> -		}
>>>> -
>>>>  		/*
>>>>  		 * Easy case - we have equal or more hardware queues. Or
>>>>  		 * there are no thread siblings to take into account. Do
>>>>  		 * 1:1 if enough, or sequential mapping if less.
>>>>  		 */
>>>> -		if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
>>>> -			map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
>>>> +		if (nr_queues >= nr_cpu_ids) {
>>>> +			map[i] = cpu_to_queue_index(nr_cpu_ids, nr_queues, queue);
>>>>  			queue++;
>>>>  			continue;
>>>>  		}
>>>> @@ -75,7 +70,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
>>>>  		 */
>>>>  		first_sibling = get_first_sibling(i);
>>>>  		if (first_sibling == i) {
>>>> -			map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
>>>> +			map[i] = cpu_to_queue_index(nr_cpu_ids, nr_queues,
>>>>  						    queue);
>>>>  			queue++;
>>>>  		} else
>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>> index 6d6f8fe..0753fdf 100644
>>>> --- a/block/blk-mq.c
>>>> +++ b/block/blk-mq.c
>>>> @@ -1783,10 +1783,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>>>>  		INIT_LIST_HEAD(&__ctx->rq_list);
>>>>  		__ctx->queue = q;
>>>>
>>>> -		/* If the cpu isn't online, the cpu is mapped to first hctx */
>>>> -		if (!cpu_online(i))
>>>> -			continue;
>>>> -
>>>>  		hctx = q->mq_ops->map_queue(q, i);
>>>>
>>>>  		/*
>>>> @@ -1820,12 +1816,9 @@ static void blk_mq_map_swqueue(struct request_queue *q,
>>>>  	 * Map software to hardware queues
>>>>  	 */
>>>>  	queue_for_each_ctx(q, ctx, i) {
>>>> -		/* If the cpu isn't online, the cpu is mapped to first hctx */
>>>> -		if (!cpumask_test_cpu(i, online_mask))
>>>> -			continue;
>>>> -
>>>>  		hctx = q->mq_ops->map_queue(q, i);
>>>> -		cpumask_set_cpu(i, hctx->cpumask);
>>>> +		if (cpumask_test_cpu(i, online_mask))
>>>> +			cpumask_set_cpu(i, hctx->cpumask);
>>>>  		ctx->index_hw = hctx->nr_ctx;
>>>>  		hctx->ctxs[hctx->nr_ctx++] = ctx;
>>>>  	}
>>>> @@ -1863,17 +1856,22 @@ static void blk_mq_map_swqueue(struct request_queue *q,
>>>>
>>>>  		/*
>>>>  		 * Initialize batch roundrobin counts.
>>>> +		 * Set next_cpu only for those hctxs that have an online CPU
>>>> +		 * in their cpumask field. For hctxs that span both online and
>>>> +		 * offline CPUs, this will always pick an online CPU. For
>>>> +		 * hctxs whose CPUs are all offline, the cpumask will be
>>>> +		 * updated in reinit_notify.
>>>>  		 */
>>>> -		hctx->next_cpu = cpumask_first(hctx->cpumask);
>>>> -		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
>>>> +		if (cpumask_first(hctx->cpumask) < nr_cpu_ids) {
>>>> +			hctx->next_cpu = cpumask_first(hctx->cpumask);
>>>> +			hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
>>>> +		}
>>>>  	}
>>>>
>>>>  	queue_for_each_ctx(q, ctx, i) {
>>>> -		if (!cpumask_test_cpu(i, online_mask))
>>>> -			continue;
>>>> -
>>>>  		hctx = q->mq_ops->map_queue(q, i);
>>>> -		cpumask_set_cpu(i, hctx->tags->cpumask);
>>>> +		if (cpumask_test_cpu(i, online_mask))
>>>> +			cpumask_set_cpu(i, hctx->tags->cpumask);
>>>>  	}
>>>>  }
>>>>
>>>> @@ -2101,31 +2099,12 @@ void blk_mq_free_queue(struct request_queue *q)
>>>>  	blk_mq_free_hw_queues(q, set);
>>>>  }
>>>>
>>>> -/* Basically redo blk_mq_init_queue with queue frozen */
>>>> -static void blk_mq_queue_reinit(struct request_queue *q,
>>>> -				const struct cpumask *online_mask)
>>>> -{
>>>> -	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
>>>> -
>>>> -	blk_mq_sysfs_unregister(q);
>>>> -
>>>> -	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
>
> Now blk_mq_update_queue_map() is only used in block/blk-mq-cpumap.c,
> so you can make blk_mq_update_queue_map() a static function.
>
>>>> -
>>>> -	/*
>>>> -	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
>>>> -	 * we should change hctx numa_node according to new topology (this
>>>> -	 * involves free and re-allocate memory, worthy doing?)
>>>> -	 */
>>>> -
>>>> -	blk_mq_map_swqueue(q, online_mask);
>>>> -
>>>> -	blk_mq_sysfs_register(q);
>>>> -}
>>>> -
>>>>  static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>>>>  		unsigned long action, void *hcpu)
>>>>  {
>>>>  	struct request_queue *q;
>>>> +	struct blk_mq_hw_ctx *hctx;
>>>> +	int i;
>>>>  	int cpu = (unsigned long)hcpu;
>>>>  	/*
>>>>  	 * New online cpumask which is going to be set in this hotplug event.
>>>> @@ -2155,43 +2134,29 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>>>>  	case CPU_DEAD:
>>>>  	case CPU_UP_CANCELED:
>>>>  		cpumask_copy(&online_new, cpu_online_mask);
>>>> +		list_for_each_entry(q, &all_q_list, all_q_node) {
>>>> +			queue_for_each_hw_ctx(q, hctx, i) {
>>>> +				cpumask_clear_cpu(cpu, hctx->cpumask);
>>>> +				cpumask_clear_cpu(cpu, hctx->tags->cpumask);
>>>> +			}
>>>> +		}
>
> Do we need to hold all_q_mutex while iterating through all_q_list?
>

Yes, we need to hold all_q_mutex here.
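For the next version, something along these lines (untested sketch; the
CPU_UP_PREPARE branch would take the same lock around its loop):

	case CPU_DEAD:
	case CPU_UP_CANCELED:
		cpumask_copy(&online_new, cpu_online_mask);
		/* all_q_mutex protects traversal of all_q_list */
		mutex_lock(&all_q_mutex);
		list_for_each_entry(q, &all_q_list, all_q_node) {
			queue_for_each_hw_ctx(q, hctx, i) {
				cpumask_clear_cpu(cpu, hctx->cpumask);
				cpumask_clear_cpu(cpu, hctx->tags->cpumask);
			}
		}
		mutex_unlock(&all_q_mutex);
		break;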
>>>>  		break;
>>>>  	case CPU_UP_PREPARE:
>>>>  		cpumask_copy(&online_new, cpu_online_mask);
>>>>  		cpumask_set_cpu(cpu, &online_new);
>>>> +		/* Update hctx->cpumask for newly onlined CPUs */
>>>> +		list_for_each_entry(q, &all_q_list, all_q_node) {
>>>> +			queue_for_each_hw_ctx(q, hctx, i) {
>>>> +				cpumask_set_cpu(cpu, hctx->cpumask);
>>>> +				hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
>>>> +				cpumask_set_cpu(cpu, hctx->tags->cpumask);
>>>> +			}
>>>> +		}
>>>>  		break;
>>>>  	default:
>>>>  		return NOTIFY_OK;
>>>>  	}
>>>>
>>>> -	mutex_lock(&all_q_mutex);
>>>> -
>>>> -	/*
>>>> -	 * We need to freeze and reinit all existing queues. Freezing
>>>> -	 * involves synchronous wait for an RCU grace period and doing it
>>>> -	 * one by one may take a long time. Start freezing all queues in
>>>> -	 * one swoop and then wait for the completions so that freezing can
>>>> -	 * take place in parallel.
>>>> -	 */
>>>> -	list_for_each_entry(q, &all_q_list, all_q_node)
>>>> -		blk_mq_freeze_queue_start(q);
>>>> -	list_for_each_entry(q, &all_q_list, all_q_node) {
>>>> -		blk_mq_freeze_queue_wait(q);
>>>> -
>>>> -		/*
>>>> -		 * timeout handler can't touch hw queue during the
>>>> -		 * reinitialization
>>>> -		 */
>>>> -		del_timer_sync(&q->timeout);
>>>> -	}
>>>> -
>>>> -	list_for_each_entry(q, &all_q_list, all_q_node)
>>>> -		blk_mq_queue_reinit(q, &online_new);
>>>> -
>>>> -	list_for_each_entry(q, &all_q_list, all_q_node)
>>>> -		blk_mq_unfreeze_queue(q);
>>>> -
>>>> -	mutex_unlock(&all_q_mutex);
>
> This change makes the 'online_new' cpumask useless in blk_mq_queue_reinit_notify(),
> so you can remove it completely.
>

Will remove this in the new patch set. Thanks for the review comments.
I will take the review comments into account and send a new patch set.

>>>>  	return NOTIFY_OK;
>>>>  }
>>>>
>>> With the patch I have collected some numbers, running some Android monkey
>>> tests along with an app that does random hotplugging of CPUs. The numbers
>>> obtained are as below:
>>>
>>> CPU UP time (in microseconds):
>>> default blk-mq:           58629  54207  61792  58458  59286
>>> blk-mq with above patch:   2755   2588   3412   3239   3010
>>>
>>> CPU DOWN time (in microseconds):
>>> default blk-mq:           95990  103379  103686  93214  128274
>>> blk-mq with above patch:  67178   49876   70027  61261   59656
>>>
>> Could someone please review the patch and provide some feedback?
>> Even if the feedback is about the approach we are trying here, or some
>> suggestions to move ahead in that direction.
>>>>>> Did you try Tejun's percpu_ref patchset that I mentioned in the last email?
>>>>>> (http://www.mail-archive.com/linux-kernel@xxxxxxxxxxxxxxx/msg984823.html)
>>>>>>
>>>>> I have tried the suggested patch but did not see any improvements in CPU
>>>>> hotplug latencies.
>>>>
>>
>> --
>> Imran Khan
>> QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
>> member of the Code Aurora Forum, hosted by The Linux Foundation

--
Imran Khan
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
member of the Code Aurora Forum, hosted by The Linux Foundation