Re: [Query] increased latency observed in cpu hotplug path

Akinobu Mita <akinobu.mita@xxxxxxxxx> · Thu, 22 Sep 2016 00:56:05 +0900

2016-09-21 23:06 GMT+09:00 Khan, Imran <kimran@xxxxxxxxxxxxxx>:

>>> commit 084ee793ec1ff4e2171b481642bfbdaa2e10664c
>>> Author: Imran Khan <kimran@xxxxxxxxxxxxxx>
>>> Date:   Thu Sep 15 16:44:02 2016 +0530
>>>
>>>     blk-mq: use static mapping
>>>
>>>     blk-mq layer performs a remapping between s/w and h/w contexts and also
>>>     between h/w contexts and CPUs, whenever a CPU hotplug event happens.
>>>     This remapping has to wait for queue freezing which may take tens of
>>>     miliseconds, resulting in a high latency in CPU hotplug path.
>>>     This patch makes the above mentioned mappings static so that we can
>>>     avoid remapping when CPU hotplug event happens and this results in
>>>     improved CPU hotplug latency.
>>>
>>>     Change-Id: Idf38cb6c4e78c91fda3c86608c6d0441f01ab435
>>>     Signed-off-by: Imran Khan <kimran@xxxxxxxxxxxxxx>
>>>
>>> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
>>> index 8764c24..85fdb88 100644
>>> --- a/block/blk-mq-cpumap.c
>>> +++ b/block/blk-mq-cpumap.c
>>> @@ -52,18 +52,13 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
>>>
>>>      queue = 0;
>>>      for_each_possible_cpu(i) {
>>> -            if (!cpumask_test_cpu(i, online_mask)) {
>>> -                    map[i] = 0;
>>> -                    continue;
>>> -            }
>>> -
>>>              /*
>>>               * Easy case - we have equal or more hardware queues. Or
>>>               * there are no thread siblings to take into account. Do
>>>               * 1:1 if enough, or sequential mapping if less.
>>>               */
>>> -            if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
>>> -                    map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
>>> +            if (nr_queues >= nr_cpu_ids) {
>>> +                    map[i] = cpu_to_queue_index(nr_cpu_ids, nr_queues, queue);
>>>                      queue++;
>>>                      continue;
>>>              }
>>> @@ -75,7 +70,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
>>>               */
>>>              first_sibling = get_first_sibling(i);
>>>              if (first_sibling == i) {
>>> -                    map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
>>> +                    map[i] = cpu_to_queue_index(nr_cpu_ids, nr_queues,
>>>                                                      queue);
>>>                      queue++;
>>>              } else
>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>> index 6d6f8fe..0753fdf 100644
>>> --- a/block/blk-mq.c
>>> +++ b/block/blk-mq.c
>>> @@ -1783,10 +1783,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>>>              INIT_LIST_HEAD(&__ctx->rq_list);
>>>              __ctx->queue = q;
>>>
>>> -            /* If the cpu isn't online, the cpu is mapped to first hctx */
>>> -            if (!cpu_online(i))
>>> -                    continue;
>>> -
>>>              hctx = q->mq_ops->map_queue(q, i);
>>>
>>>              /*
>>> @@ -1820,12 +1816,9 @@ static void blk_mq_map_swqueue(struct request_queue *q,
>>>       * Map software to hardware queues
>>>       */
>>>      queue_for_each_ctx(q, ctx, i) {
>>> -            /* If the cpu isn't online, the cpu is mapped to first hctx */
>>> -            if (!cpumask_test_cpu(i, online_mask))
>>> -                    continue;
>>> -
>>>              hctx = q->mq_ops->map_queue(q, i);
>>> -            cpumask_set_cpu(i, hctx->cpumask);
>>> +            if (cpumask_test_cpu(i, online_mask))
>>> +                    cpumask_set_cpu(i, hctx->cpumask);
>>>              ctx->index_hw = hctx->nr_ctx;
>>>              hctx->ctxs[hctx->nr_ctx++] = ctx;
>>>      }
>>> @@ -1863,17 +1856,22 @@ static void blk_mq_map_swqueue(struct request_queue *q,
>>>
>>>              /*
>>>               * Initialize batch roundrobin counts
>>> +             * Set next_cpu for only those hctxs that have an online CPU
>>> +             * in their cpumask field. For hctxs that belong to few online
>>> +             * and few offline CPUs, this will always provide one CPU from
>>> +             * online ones. For hctxs belonging to all offline CPUs, their
>>> +             * cpumask will be updated in reinit_notify.
>>>               */
>>> -            hctx->next_cpu = cpumask_first(hctx->cpumask);
>>> -            hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
>>> +            if(cpumask_first(hctx->cpumask) < nr_cpu_ids) {
>>> +                    hctx->next_cpu = cpumask_first(hctx->cpumask);
>>> +                    hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
>>> +            }
>>>      }
>>>
>>>      queue_for_each_ctx(q, ctx, i) {
>>> -            if (!cpumask_test_cpu(i, online_mask))
>>> -                    continue;
>>> -
>>>              hctx = q->mq_ops->map_queue(q, i);
>>> -            cpumask_set_cpu(i, hctx->tags->cpumask);
>>> +            if (cpumask_test_cpu(i, online_mask))
>>> +                    cpumask_set_cpu(i, hctx->tags->cpumask);
>>>      }
>>>  }
>>>
>>> @@ -2101,31 +2099,12 @@ void blk_mq_free_queue(struct request_queue *q)
>>>      blk_mq_free_hw_queues(q, set);
>>>  }
>>>
>>> -/* Basically redo blk_mq_init_queue with queue frozen */
>>> -static void blk_mq_queue_reinit(struct request_queue *q,
>>> -                            const struct cpumask *online_mask)
>>> -{
>>> -    WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
>>> -
>>> -    blk_mq_sysfs_unregister(q);
>>> -
>>> -    blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);

Now blk_mq_update_queue_map() is only used in block/blk-mq-cpumap.c,
so you can make blk_mq_update_queue_map() static function.

>>> -
>>> -    /*
>>> -     * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
>>> -     * we should change hctx numa_node according to new topology (this
>>> -     * involves free and re-allocate memory, worthy doing?)
>>> -     */
>>> -
>>> -    blk_mq_map_swqueue(q, online_mask);
>>> -
>>> -    blk_mq_sysfs_register(q);
>>> -}
>>> -
>>>  static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>>>                                    unsigned long action, void *hcpu)
>>>  {
>>>      struct request_queue *q;
>>> +    struct blk_mq_hw_ctx *hctx;
>>> +    int i;
>>>      int cpu = (unsigned long)hcpu;
>>>      /*
>>>       * New online cpumask which is going to be set in this hotplug event.
>>> @@ -2155,43 +2134,29 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>>>      case CPU_DEAD:
>>>      case CPU_UP_CANCELED:
>>>              cpumask_copy(&online_new, cpu_online_mask);
>>> +            list_for_each_entry(q, &all_q_list, all_q_node) {
>>> +                    queue_for_each_hw_ctx(q, hctx, i) {
>>> +                            cpumask_clear_cpu(cpu, hctx->cpumask);
>>> +                            cpumask_clear_cpu(cpu, hctx->tags->cpumask);
>>> +                    }
>>> +            }

Do we need to hold all_q_mutex while iterating through all_q_list?

>>>              break;
>>>      case CPU_UP_PREPARE:
>>>              cpumask_copy(&online_new, cpu_online_mask);
>>>              cpumask_set_cpu(cpu, &online_new);
>>> +            /* Update hctx->cpumask for newly onlined CPUs */
>>> +            list_for_each_entry(q, &all_q_list, all_q_node) {
>>> +                    queue_for_each_hw_ctx(q, hctx, i) {
>>> +                            cpumask_set_cpu(cpu, hctx->cpumask);
>>> +                            hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
>>> +                            cpumask_set_cpu(cpu, hctx->tags->cpumask);
>>> +                    }
>>> +            }
>>>              break;
>>>      default:
>>>              return NOTIFY_OK;
>>>      }
>>>
>>> -    mutex_lock(&all_q_mutex);
>>> -
>>> -    /*
>>> -     * We need to freeze and reinit all existing queues.  Freezing
>>> -     * involves synchronous wait for an RCU grace period and doing it
>>> -     * one by one may take a long time.  Start freezing all queues in
>>> -     * one swoop and then wait for the completions so that freezing can
>>> -     * take place in parallel.
>>> -     */
>>> -    list_for_each_entry(q, &all_q_list, all_q_node)
>>> -            blk_mq_freeze_queue_start(q);
>>> -    list_for_each_entry(q, &all_q_list, all_q_node) {
>>> -            blk_mq_freeze_queue_wait(q);
>>> -
>>> -            /*
>>> -             * timeout handler can't touch hw queue during the
>>> -             * reinitialization
>>> -             */
>>> -            del_timer_sync(&q->timeout);
>>> -    }
>>> -
>>> -    list_for_each_entry(q, &all_q_list, all_q_node)
>>> -            blk_mq_queue_reinit(q, &online_new);
>>> -
>>> -    list_for_each_entry(q, &all_q_list, all_q_node)
>>> -            blk_mq_unfreeze_queue(q);
>>> -
>>> -    mutex_unlock(&all_q_mutex);

This change makes 'online_new' cpumask useless in blk_mq_queue_reinit_notify(),
so you can remove it completely.

>>>      return NOTIFY_OK;
>>>  }
>>>
>>>
>>>
>>>
>> With the patch I have collected some numbers, running some android monkey tests along with
>> an app that does random hootplugging of CPUs. The numbers obtained are as below:
>>
>> CPU UP time(in micro seconds):
>>          default blk-mq: 58629 54207 61792 58458 59286
>> blk-mq with above patch: 2755  2588  3412  3239  3010
>>
>> CPU DOWN time(in micros seconds):
>>          default blk-mq: 95990  103379 103686 93214  128274
>> blk-mq with above patch: 67178  49876  70027  61261  59656
>>
> Could someone please review the patch and provide some feedback?
> Even if the feedback is about the approach we are trying here or some
> suggestions to move ahead in that direction.
>>>>> Did you tried Tejun's percpu_ref patchset that I said in the last email?
>>>>> (http://www.mail-archive.com/linux-kernel@xxxxxxxxxxxxxxx/msg984823.html)
>>>>>
>>>>>
>>>> I have tried the suggested patch but did not see any improvements in cpu
>>>> hotplug latencies
>>>>
>>>
>>>
>>
>>
>
>
> --
> Imran Khan
> QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a\nmember of the Code Aurora Forum, hosted by The Linux Foundation
--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html