> On 19 Mar 2019, at 14.33, Marcin Dziegielewski <marcin.dziegielewski@xxxxxxxxx> wrote: > > > > On 3/18/19 7:22 PM, Javier González wrote: >>> On 18 Mar 2019, at 13.22, Hans Holmberg <hans@xxxxxxxxxxxxx> wrote: >>> >>> On Mon, Mar 18, 2019 at 10:44 AM Marcin Dziegielewski >>> <marcin.dziegielewski@xxxxxxxxx> wrote: >>>> On 3/14/19 3:22 PM, Javier González wrote: >>>>>> On 14 Mar 2019, at 07.22, Javier González <javier@xxxxxxxxxxx> wrote: >>>>>> >>>>>>> On 14 Mar 2019, at 07.17, Marcin Dziegielewski <marcin.dziegielewski@xxxxxxxxx> wrote: >>>>>>> >>>>>>> In some cases write thread migration between cpus can cause >>>>>>> sending writes in improper order and in consequence a lot of >>>>>>> errors from device. >>>>>>> >>>>>>> Write thread affinity to particular cpu prevent before it. >>>>>>> >>>>>>> Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@xxxxxxxxx> >>>>>>> --- >>>>>>> drivers/lightnvm/pblk-init.c | 30 +++++++++++++++++++++++++++++- >>>>>>> drivers/lightnvm/pblk.h | 1 + >>>>>>> drivers/nvme/host/lightnvm.c | 1 + >>>>>>> include/linux/lightnvm.h | 2 ++ >>>>>>> 4 files changed, 33 insertions(+), 1 deletion(-) >>>>>>> >>>>>>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c >>>>>>> index 81e8ed4..bd25004 100644 >>>>>>> --- a/drivers/lightnvm/pblk-init.c >>>>>>> +++ b/drivers/lightnvm/pblk-init.c >>>>>>> @@ -21,6 +21,8 @@ >>>>>>> >>>>>>> #include "pblk.h" >>>>>>> #include "pblk-trace.h" >>>>>>> +#include <linux/cpumask.h> >>>>>>> +#include <linux/numa.h> >>>>>>> >>>>>>> static unsigned int write_buffer_size; >>>>>>> >>>>>>> @@ -47,6 +49,8 @@ struct pblk_global_caches { >>>>>>> >>>>>>> struct bio_set pblk_bio_set; >>>>>>> >>>>>>> +cpumask_t free_cpumask; >>>>>>> + >>>>>>> static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, >>>>>>> struct bio *bio) >>>>>>> { >>>>>>> @@ -1098,6 +1102,9 @@ static int pblk_lines_init(struct pblk *pblk) >>>>>>> >>>>>>> static int pblk_writer_init(struct pblk *pblk) >>>>>>> { >>>>>>> + cpumask_t tmp_cpumask, cpumask; >>>>>>> + int cpu; >>>>>>> + >>>>>>> pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t"); >>>>>>> if (IS_ERR(pblk->writer_ts)) { >>>>>>> int err = PTR_ERR(pblk->writer_ts); >>>>>>> @@ -1108,6 +1115,22 @@ static int pblk_writer_init(struct pblk *pblk) >>>>>>> return err; >>>>>>> } >>>>>>> >>>>>>> + cpumask_and(&tmp_cpumask, cpumask_of_node(pblk->dev->parent->node), >>>>>>> + cpu_online_mask); >>>>>>> + cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask); >>>>>>> + >>>>>>> + if (!cpumask_weight(&free_cpumask)) { >>>>>>> + free_cpumask = CPU_MASK_ALL; >>>>>>> + cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask); >>>>>>> + } >>>>>>> + >>>>>>> + cpu = cpumask_last(&cpumask); >>>>>>> + >>>>>>> + kthread_bind(pblk->writer_ts, cpu); >>>>>>> + >>>>>>> + cpumask_clear_cpu(cpu, &free_cpumask); >>>>>>> + pblk->writer_cpu = cpu; >>>>>>> + >>>>>>> timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0); >>>>>>> mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100)); >>>>>>> >>>>>>> @@ -1126,8 +1149,11 @@ static void pblk_writer_stop(struct pblk *pblk) >>>>>>> "Stopping not fully synced write buffer\n"); >>>>>>> >>>>>>> del_timer_sync(&pblk->wtimer); >>>>>>> - if (pblk->writer_ts) >>>>>>> + if (pblk->writer_ts) { >>>>>>> + set_cpus_allowed_ptr(pblk->writer_ts, cpu_online_mask); >>>>>>> kthread_stop(pblk->writer_ts); >>>>>>> + cpumask_set_cpu(pblk->writer_cpu, &free_cpumask); >>>>>>> + } >>>>>>> } >>>>>>> >>>>>>> static void pblk_free(struct pblk *pblk) >>>>>>> @@ -1328,6 +1354,8 @@ static int __init pblk_module_init(void) >>>>>>> { >>>>>>> int ret; >>>>>>> >>>>>>> + free_cpumask = CPU_MASK_ALL; >>>>>>> + >>>>>>> ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0); >>>>>>> if (ret) >>>>>>> return ret; >>>>>>> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h >>>>>>> index 381f074..650f983 100644 >>>>>>> --- a/drivers/lightnvm/pblk.h >>>>>>> +++ b/drivers/lightnvm/pblk.h >>>>>>> @@ -690,6 +690,7 @@ struct pblk { >>>>>>> atomic_t inflight_io; /* General inflight I/O counter */ >>>>>>> >>>>>>> struct task_struct *writer_ts; >>>>>>> + int writer_cpu; >>>>>>> >>>>>>> /* Simple translation map of logical addresses to physical addresses. >>>>>>> * The logical addresses is known by the host system, while the physical >>>>>>> diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c >>>>>>> index 949e29e..971a19f 100644 >>>>>>> --- a/drivers/nvme/host/lightnvm.c >>>>>>> +++ b/drivers/nvme/host/lightnvm.c >>>>>>> @@ -982,6 +982,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) >>>>>>> memcpy(dev->name, disk_name, DISK_NAME_LEN); >>>>>>> dev->ops = &nvme_nvm_dev_ops; >>>>>>> dev->private_data = ns; >>>>>>> + dev->node = node; >>>>>>> ns->ndev = dev; >>>>>>> >>>>>>> return nvm_register(dev); >>>>>>> diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h >>>>>>> index 5d865a5..312029e 100644 >>>>>>> --- a/include/linux/lightnvm.h >>>>>>> +++ b/include/linux/lightnvm.h >>>>>>> @@ -427,6 +427,8 @@ struct nvm_dev { >>>>>>> char name[DISK_NAME_LEN]; >>>>>>> void *private_data; >>>>>>> >>>>>>> + int node; >>>>>>> + >>>>>>> void *rmap; >>>>>>> >>>>>>> struct mutex mlock; >>>>>>> -- >>>>>>> 1.8.3.1 >>>>>> >>>>>> We have a per-CPU semaphore that only allows to send a single I/O in >>>>>> order to prevent write pointer violations. Are you seeing this error, or >>>>>> is it theoretical? >>>>>> >>>>>> Javier >>>>> >>>>> I meant per PU (parallel unit)… >>>> >>>> You are right, for current upstream implementation it's theoretical. I >>>> saw it after some experimental changes in lightnvm layer. >>>> >>>> But I see one more potential benefit from this patch (correct me if I'm >>>> wrong), after this patch we will be sure that write thread is working on >>>> cpu from right numa node for device. Now I think, we can also take under >>>> consideration binding to whole set of cpus from right numa node instead >>>> of particular one. >> I can see what you mean, but I think we are better relying on the >> scheduler balancing the NUMA nodes rather than pinning the write thread >> to specific CPUs. Are we doing something like this in the writeback >> path? > I have checked wrtieback path and I didn't find nothing similar, so please disregard this patch. > > Marcin > Don’t get me wrong. I think it is a good motivation, but maybe it should be implemented at a different level. Thanks, Javier
Attachment:
signature.asc
Description: Message signed with OpenPGP