On Mon, Mar 18, 2019 at 10:44 AM Marcin Dziegielewski
<marcin.dziegielewski@xxxxxxxxx> wrote:
>
> On 3/14/19 3:22 PM, Javier González wrote:
> >
> >> On 14 Mar 2019, at 07.22, Javier González <javier@xxxxxxxxxxx> wrote:
> >>
> >>> On 14 Mar 2019, at 07.17, Marcin Dziegielewski <marcin.dziegielewski@xxxxxxxxx> wrote:
> >>>
> >>> In some cases, write thread migration between CPUs can cause
> >>> writes to be sent in the wrong order and, in consequence, a lot
> >>> of errors from the device.
> >>>
> >>> Pinning the write thread to a particular CPU prevents this.
> >>>
> >>> Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@xxxxxxxxx>
> >>> ---
> >>>  drivers/lightnvm/pblk-init.c | 30 +++++++++++++++++++++++++++++-
> >>>  drivers/lightnvm/pblk.h      |  1 +
> >>>  drivers/nvme/host/lightnvm.c |  1 +
> >>>  include/linux/lightnvm.h     |  2 ++
> >>>  4 files changed, 33 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
> >>> index 81e8ed4..bd25004 100644
> >>> --- a/drivers/lightnvm/pblk-init.c
> >>> +++ b/drivers/lightnvm/pblk-init.c
> >>> @@ -21,6 +21,8 @@
> >>>
> >>>  #include "pblk.h"
> >>>  #include "pblk-trace.h"
> >>> +#include <linux/cpumask.h>
> >>> +#include <linux/numa.h>
> >>>
> >>>  static unsigned int write_buffer_size;
> >>>
> >>> @@ -47,6 +49,8 @@ struct pblk_global_caches {
> >>>
> >>>  struct bio_set pblk_bio_set;
> >>>
> >>> +cpumask_t free_cpumask;
> >>> +
> >>>  static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
> >>>  		      struct bio *bio)
> >>>  {
> >>> @@ -1098,6 +1102,9 @@ static int pblk_lines_init(struct pblk *pblk)
> >>>
> >>>  static int pblk_writer_init(struct pblk *pblk)
> >>>  {
> >>> +	cpumask_t tmp_cpumask, cpumask;
> >>> +	int cpu;
> >>> +
> >>>  	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
> >>>  	if (IS_ERR(pblk->writer_ts)) {
> >>>  		int err = PTR_ERR(pblk->writer_ts);
> >>> @@ -1108,6 +1115,22 @@ static int pblk_writer_init(struct pblk *pblk)
> >>>  		return err;
> >>>  	}
> >>>
> >>> +	cpumask_and(&tmp_cpumask, cpumask_of_node(pblk->dev->parent->node),
> >>> +		    cpu_online_mask);
> >>> +	cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
> >>> +
> >>> +	if (!cpumask_weight(&free_cpumask)) {
> >>> +		free_cpumask = CPU_MASK_ALL;
> >>> +		cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
> >>> +	}
> >>> +
> >>> +	cpu = cpumask_last(&cpumask);
> >>> +
> >>> +	kthread_bind(pblk->writer_ts, cpu);
> >>> +
> >>> +	cpumask_clear_cpu(cpu, &free_cpumask);
> >>> +	pblk->writer_cpu = cpu;
> >>> +
> >>>  	timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
> >>>  	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
> >>>
> >>> @@ -1126,8 +1149,11 @@ static void pblk_writer_stop(struct pblk *pblk)
> >>>  			"Stopping not fully synced write buffer\n");
> >>>
> >>>  	del_timer_sync(&pblk->wtimer);
> >>> -	if (pblk->writer_ts)
> >>> +	if (pblk->writer_ts) {
> >>> +		set_cpus_allowed_ptr(pblk->writer_ts, cpu_online_mask);
> >>>  		kthread_stop(pblk->writer_ts);
> >>> +		cpumask_set_cpu(pblk->writer_cpu, &free_cpumask);
> >>> +	}
> >>>  }
> >>>
> >>>  static void pblk_free(struct pblk *pblk)
> >>> @@ -1328,6 +1354,8 @@ static int __init pblk_module_init(void)
> >>>  {
> >>>  	int ret;
> >>>
> >>> +	free_cpumask = CPU_MASK_ALL;
> >>> +
> >>>  	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
> >>>  	if (ret)
> >>>  		return ret;
> >>> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
> >>> index 381f074..650f983 100644
> >>> --- a/drivers/lightnvm/pblk.h
> >>> +++ b/drivers/lightnvm/pblk.h
> >>> @@ -690,6 +690,7 @@ struct pblk {
> >>>  	atomic_t inflight_io;	/* General inflight I/O counter */
> >>>
> >>>  	struct task_struct *writer_ts;
> >>> +	int writer_cpu;
> >>>
> >>>  	/* Simple translation map of logical addresses to physical addresses.
> >>>  	 * The logical addresses is known by the host system, while the physical
> >>> diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
> >>> index 949e29e..971a19f 100644
> >>> --- a/drivers/nvme/host/lightnvm.c
> >>> +++ b/drivers/nvme/host/lightnvm.c
> >>> @@ -982,6 +982,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
> >>>  	memcpy(dev->name, disk_name, DISK_NAME_LEN);
> >>>  	dev->ops = &nvme_nvm_dev_ops;
> >>>  	dev->private_data = ns;
> >>> +	dev->node = node;
> >>>  	ns->ndev = dev;
> >>>
> >>>  	return nvm_register(dev);
> >>> diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
> >>> index 5d865a5..312029e 100644
> >>> --- a/include/linux/lightnvm.h
> >>> +++ b/include/linux/lightnvm.h
> >>> @@ -427,6 +427,8 @@ struct nvm_dev {
> >>>  	char name[DISK_NAME_LEN];
> >>>  	void *private_data;
> >>>
> >>> +	int node;
> >>> +
> >>>  	void *rmap;
> >>>
> >>>  	struct mutex mlock;
> >>> --
> >>> 1.8.3.1
> >>
> >> We have a per-CPU semaphore that only allows a single I/O to be sent,
> >> in order to prevent write pointer violations. Are you seeing this
> >> error, or is it theoretical?
> >>
> >> Javier
> >
> > I meant per PU (parallel unit)…
> >
>
> You are right, for the current upstream implementation it's theoretical.
> I saw it after some experimental changes in the lightnvm layer.
>
> But I see one more potential benefit from this patch (correct me if I'm
> wrong): after it, we can be sure that the write thread is running on a
> CPU from the right NUMA node for the device. Now I think we could also
> consider binding to the whole set of CPUs from the right NUMA node
> instead of a particular one.

If this is just a potential benefit, I'd rather not do this, at least
not until we have numbers on any performance benefits and an analysis
of any trade-offs.

Thanks,
Hans

>
> Marcin
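
For reference, a minimal sketch of the node-wide binding Marcin floats
above could look like the following. pblk_writer_bind_node() is a
hypothetical helper, not part of the posted patch; it restricts the
writer thread to the online CPUs of the device's NUMA node with
set_cpus_allowed_ptr() instead of pinning it to one CPU with
kthread_bind().

#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/topology.h>

/*
 * Hypothetical sketch, not from the posted patch: allow the writer
 * thread to run on any online CPU of the device's NUMA node instead
 * of kthread_bind()-ing it to exactly one CPU.
 */
static int pblk_writer_bind_node(struct task_struct *writer_ts, int node)
{
	const struct cpumask *mask = cpumask_of_node(node);

	/* Fall back to all online CPUs if the node has none online. */
	if (!cpumask_intersects(mask, cpu_online_mask))
		mask = cpu_online_mask;

	/*
	 * Unlike kthread_bind(), this only restricts the affinity mask;
	 * the scheduler may still migrate the thread between CPUs of
	 * the node, so the single-CPU placement guarantee is lost.
	 */
	return set_cpus_allowed_ptr(writer_ts, mask);
}

The trade-off is the one the thread already identifies: a node-wide
mask keeps NUMA locality but reintroduces migration between CPUs of
the node, the very behavior the patch set out to prevent, which is
why Hans asks for numbers before making such a change.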