Re: [PATCH] lightnvm: pblk: set write thread affinity to particular cpu

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Mar 18, 2019 at 10:44 AM Marcin Dziegielewski
<marcin.dziegielewski@xxxxxxxxx> wrote:
>
> On 3/14/19 3:22 PM, Javier González wrote:
> >
> >> On 14 Mar 2019, at 07.22, Javier González <javier@xxxxxxxxxxx> wrote:
> >>
> >>> On 14 Mar 2019, at 07.17, Marcin Dziegielewski <marcin.dziegielewski@xxxxxxxxx> wrote:
> >>>
> >>> In some cases write thread migration between cpus can cause
> >>> sending writes in improper order and in consequence a lot of
> >>> errors from device.
> >>>
> >>> Write thread affinity to particular cpu prevent before it.
> >>>
> >>> Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@xxxxxxxxx>
> >>> ---
> >>> drivers/lightnvm/pblk-init.c | 30 +++++++++++++++++++++++++++++-
> >>> drivers/lightnvm/pblk.h      |  1 +
> >>> drivers/nvme/host/lightnvm.c |  1 +
> >>> include/linux/lightnvm.h     |  2 ++
> >>> 4 files changed, 33 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
> >>> index 81e8ed4..bd25004 100644
> >>> --- a/drivers/lightnvm/pblk-init.c
> >>> +++ b/drivers/lightnvm/pblk-init.c
> >>> @@ -21,6 +21,8 @@
> >>>
> >>> #include "pblk.h"
> >>> #include "pblk-trace.h"
> >>> +#include <linux/cpumask.h>
> >>> +#include <linux/numa.h>
> >>>
> >>> static unsigned int write_buffer_size;
> >>>
> >>> @@ -47,6 +49,8 @@ struct pblk_global_caches {
> >>>
> >>> struct bio_set pblk_bio_set;
> >>>
> >>> +cpumask_t free_cpumask;
> >>> +
> >>> static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
> >>>                       struct bio *bio)
> >>> {
> >>> @@ -1098,6 +1102,9 @@ static int pblk_lines_init(struct pblk *pblk)
> >>>
> >>> static int pblk_writer_init(struct pblk *pblk)
> >>> {
> >>> +   cpumask_t tmp_cpumask, cpumask;
> >>> +   int cpu;
> >>> +
> >>>     pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
> >>>     if (IS_ERR(pblk->writer_ts)) {
> >>>             int err = PTR_ERR(pblk->writer_ts);
> >>> @@ -1108,6 +1115,22 @@ static int pblk_writer_init(struct pblk *pblk)
> >>>             return err;
> >>>     }
> >>>
> >>> +   cpumask_and(&tmp_cpumask, cpumask_of_node(pblk->dev->parent->node),
> >>> +                   cpu_online_mask);
> >>> +   cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
> >>> +
> >>> +   if (!cpumask_weight(&free_cpumask)) {
> >>> +           free_cpumask = CPU_MASK_ALL;
> >>> +           cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
> >>> +   }
> >>> +
> >>> +   cpu = cpumask_last(&cpumask);
> >>> +
> >>> +   kthread_bind(pblk->writer_ts, cpu);
> >>> +
> >>> +   cpumask_clear_cpu(cpu, &free_cpumask);
> >>> +   pblk->writer_cpu = cpu;
> >>> +
> >>>     timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
> >>>     mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
> >>>
> >>> @@ -1126,8 +1149,11 @@ static void pblk_writer_stop(struct pblk *pblk)
> >>>                     "Stopping not fully synced write buffer\n");
> >>>
> >>>     del_timer_sync(&pblk->wtimer);
> >>> -   if (pblk->writer_ts)
> >>> +   if (pblk->writer_ts) {
> >>> +           set_cpus_allowed_ptr(pblk->writer_ts, cpu_online_mask);
> >>>             kthread_stop(pblk->writer_ts);
> >>> +           cpumask_set_cpu(pblk->writer_cpu, &free_cpumask);
> >>> +   }
> >>> }
> >>>
> >>> static void pblk_free(struct pblk *pblk)
> >>> @@ -1328,6 +1354,8 @@ static int __init pblk_module_init(void)
> >>> {
> >>>     int ret;
> >>>
> >>> +   free_cpumask = CPU_MASK_ALL;
> >>> +
> >>>     ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
> >>>     if (ret)
> >>>             return ret;
> >>> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
> >>> index 381f074..650f983 100644
> >>> --- a/drivers/lightnvm/pblk.h
> >>> +++ b/drivers/lightnvm/pblk.h
> >>> @@ -690,6 +690,7 @@ struct pblk {
> >>>     atomic_t inflight_io;           /* General inflight I/O counter */
> >>>
> >>>     struct task_struct *writer_ts;
> >>> +   int writer_cpu;
> >>>
> >>>     /* Simple translation map of logical addresses to physical addresses.
> >>>      * The logical addresses is known by the host system, while the physical
> >>> diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
> >>> index 949e29e..971a19f 100644
> >>> --- a/drivers/nvme/host/lightnvm.c
> >>> +++ b/drivers/nvme/host/lightnvm.c
> >>> @@ -982,6 +982,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
> >>>     memcpy(dev->name, disk_name, DISK_NAME_LEN);
> >>>     dev->ops = &nvme_nvm_dev_ops;
> >>>     dev->private_data = ns;
> >>> +   dev->node = node;
> >>>     ns->ndev = dev;
> >>>
> >>>     return nvm_register(dev);
> >>> diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
> >>> index 5d865a5..312029e 100644
> >>> --- a/include/linux/lightnvm.h
> >>> +++ b/include/linux/lightnvm.h
> >>> @@ -427,6 +427,8 @@ struct nvm_dev {
> >>>     char name[DISK_NAME_LEN];
> >>>     void *private_data;
> >>>
> >>> +   int node;
> >>> +
> >>>     void *rmap;
> >>>
> >>>     struct mutex mlock;
> >>> --
> >>> 1.8.3.1
> >>
> >> We have a per-CPU semaphore that only allows to send a single I/O in
> >> order to prevent write pointer violations. Are you seeing this error, or
> >> is it theoretical?
> >>
> >> Javier
> >
> > I meant per PU (parallel unit)…
> >
>
> You are right, for current upstream implementation it's theoretical. I
> saw it after some experimental changes in lightnvm layer.
>
> But I see one more potential benefit from this patch (correct me if I'm
> wrong), after this patch we will be sure that write thread is working on
> cpu from right numa node for device. Now I think, we can also take under
> consideration binding to whole set of cpus from right numa node instead
> of particular one.

If this is just a potential benefit, I'd rather not do this, at least
not until we have numbers on any performance benefits and an analysis
of any trade offs.

Thanks,
Hans
>
> Marcin




[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux