I have another implementation of raid5 multi-threading. The basic idea is to
record the CPU each stripe was submitted from. Each CPU has a thread, which
only handles stripes submitted from that CPU. Dan mentioned a similar idea
several days ago. I used to think we would need to make conf->device_lock
per-cpu for this to work, but it turns out that isn't required: simply using
per-cpu lists while still taking the global device_lock works here, and
performance is good too. This is an RFC patch; I'll resubmit it in a more
reviewable form if you like the idea.

---
 drivers/md/md.c        |    7 +
 drivers/md/md.h        |    7 +
 drivers/md/multipath.c |    3 
 drivers/md/raid1.c     |    3 
 drivers/md/raid10.c    |    3 
 drivers/md/raid5.c     |  182 +++++++++++++++++++++++++++++++++++--------------
 drivers/md/raid5.h     |    4 -
 7 files changed, 148 insertions(+), 61 deletions(-)

Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c	2012-07-09 01:25:37.522848182 -0600
+++ linux/drivers/md/raid5.c	2012-07-09 01:27:43.202847084 -0600
@@ -208,8 +208,17 @@ static void handle_release_stripe(struct
 			   sh->bm_seq - conf->seq_write > 0)
 				list_add_tail(&sh->lru, &conf->bitmap_list);
 			else {
+				int cpu = sh->cpu;
+				struct raid5_percpu *percpu;
+				if (!cpu_online(cpu)) {
+					cpu = cpumask_any(cpu_online_mask);
+					sh->cpu = cpu;
+				}
+				percpu = per_cpu_ptr(conf->percpu, cpu);
 				clear_bit(STRIPE_BIT_DELAY, &sh->state);
-				list_add_tail(&sh->lru, &conf->handle_list);
+				list_add_tail(&sh->lru, &percpu->handle_list);
+				md_wakeup_thread(percpu->aux_thread);
+				return;
 			}
 			md_wakeup_thread(conf->mddev->thread);
 		} else {
@@ -354,6 +363,7 @@ static void init_stripe(struct stripe_he
 		raid5_build_block(sh, i, previous);
 	}
 	insert_hash(conf, sh);
+	sh->cpu = smp_processor_id();
 }
 
 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -3646,12 +3656,19 @@ static void raid5_activate_delayed(struc
 		while (!list_empty(&conf->delayed_list)) {
 			struct list_head *l = conf->delayed_list.next;
 			struct stripe_head *sh;
+			int cpu;
 			sh = list_entry(l, struct stripe_head, lru);
 			list_del_init(l);
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			list_add_tail(&sh->lru, &conf->hold_list);
+			cpu = sh->cpu;
+			if (!cpu_online(cpu)) {
+				cpu = cpumask_any(cpu_online_mask);
+				sh->cpu = cpu;
+			}
+			md_wakeup_thread(per_cpu_ptr(conf->percpu, cpu)->aux_thread);
 		}
 	}
 }
@@ -3924,18 +3941,20 @@ static int chunk_aligned_read(struct mdd
  * head of the hold_list has changed, i.e. the head was promoted to the
  * handle_list.
  */
-static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
+static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int cpu)
 {
-	struct stripe_head *sh;
+	struct stripe_head *sh = NULL, *tmp;
+	struct list_head *handle_list =
+		&per_cpu_ptr(conf->percpu, cpu)->handle_list;
 
 	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
 		  __func__,
-		  list_empty(&conf->handle_list) ? "empty" : "busy",
+		  list_empty(handle_list) ? "empty" : "busy",
 		  list_empty(&conf->hold_list) ? "empty" : "busy",
"empty" : "busy", atomic_read(&conf->pending_full_writes), conf->bypass_count); - if (!list_empty(&conf->handle_list)) { - sh = list_entry(conf->handle_list.next, typeof(*sh), lru); + if (!list_empty(handle_list)) { + sh = list_entry(handle_list->next, typeof(*sh), lru); if (list_empty(&conf->hold_list)) conf->bypass_count = 0; @@ -3953,12 +3972,20 @@ static struct stripe_head *__get_priorit ((conf->bypass_threshold && conf->bypass_count > conf->bypass_threshold) || atomic_read(&conf->pending_full_writes) == 0)) { - sh = list_entry(conf->hold_list.next, - typeof(*sh), lru); - conf->bypass_count -= conf->bypass_threshold; - if (conf->bypass_count < 0) - conf->bypass_count = 0; - } else + list_for_each_entry(tmp, &conf->hold_list, lru) { + if (tmp->cpu == cpu || !cpu_online(tmp->cpu)) { + sh = tmp; + break; + } + } + + if (sh) { + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } + } + if (!sh) return NULL; list_del_init(&sh->lru); @@ -4551,13 +4578,13 @@ static int retry_aligned_read(struct r5 } #define MAX_STRIPE_BATCH 8 -static int handle_active_stripes(struct r5conf *conf) +static int handle_active_stripes(struct r5conf *conf, int cpu) { struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; int i, batch_size = 0; while (batch_size < MAX_STRIPE_BATCH && - (sh = __get_priority_stripe(conf)) != NULL) + (sh = __get_priority_stripe(conf, cpu)) != NULL) batch[batch_size++] = sh; if (batch_size == 0) @@ -4575,6 +4602,35 @@ static int handle_active_stripes(struct return batch_size; } +static void raid5auxd(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct r5conf *conf = mddev->private; + struct blk_plug plug; + int handled; + int cpu = (long)thread->thread_data; + + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + pr_debug("+++ raid5auxd active\n"); + + blk_start_plug(&plug); + handled = 0; + spin_lock_irq(&conf->device_lock); + while (1) { + int batch_size; + + batch_size = handle_active_stripes(conf, cpu); + if (!batch_size) + break; + handled += batch_size; + } + + spin_unlock_irq(&conf->device_lock); + blk_finish_plug(&plug); + + pr_debug("--- raid5auxd inactive\n"); +} + /* * This is our raid5 kernel thread. * @@ -4582,11 +4638,13 @@ static int handle_active_stripes(struct * During the scan, completed stripes are saved for us by the interrupt * handler, so that they will not have to wait for our next wakeup. 
  */
-static void raid5d(struct mddev *mddev)
+static void raid5d(struct md_thread *thread)
 {
+	struct mddev *mddev = thread->mddev;
 	struct r5conf *conf = mddev->private;
 	int handled;
 	struct blk_plug plug;
+	struct bio *bio;
 
 	pr_debug("+++ raid5d active\n");
 
@@ -4595,43 +4653,34 @@ static void raid5d(struct mddev *mddev)
 	blk_start_plug(&plug);
 	handled = 0;
 	spin_lock_irq(&conf->device_lock);
-	while (1) {
-		struct bio *bio;
-		int batch_size;
-
-		if (atomic_read(&mddev->plug_cnt) == 0 &&
-		    !list_empty(&conf->bitmap_list)) {
-			/* Now is a good time to flush some bitmap updates */
-			conf->seq_flush++;
-			spin_unlock_irq(&conf->device_lock);
-			bitmap_unplug(mddev->bitmap);
-			spin_lock_irq(&conf->device_lock);
-			conf->seq_write = conf->seq_flush;
-			activate_bit_delay(conf);
-		}
-		if (atomic_read(&mddev->plug_cnt) == 0)
-			raid5_activate_delayed(conf);
-
-		while ((bio = remove_bio_from_retry(conf))) {
-			int ok;
-			spin_unlock_irq(&conf->device_lock);
-			ok = retry_aligned_read(conf, bio);
-			spin_lock_irq(&conf->device_lock);
-			if (!ok)
-				break;
-			handled++;
-		}
+	if (atomic_read(&mddev->plug_cnt) == 0 &&
+	    !list_empty(&conf->bitmap_list)) {
+		/* Now is a good time to flush some bitmap updates */
+		conf->seq_flush++;
+		spin_unlock_irq(&conf->device_lock);
+		bitmap_unplug(mddev->bitmap);
+		spin_lock_irq(&conf->device_lock);
+		conf->seq_write = conf->seq_flush;
+		activate_bit_delay(conf);
+	}
+	if (atomic_read(&mddev->plug_cnt) == 0)
+		raid5_activate_delayed(conf);
 
-		batch_size = handle_active_stripes(conf);
-		if (!batch_size)
+	while ((bio = remove_bio_from_retry(conf))) {
+		int ok;
+		spin_unlock_irq(&conf->device_lock);
+		ok = retry_aligned_read(conf, bio);
+		spin_lock_irq(&conf->device_lock);
+		if (!ok)
 			break;
-		handled += batch_size;
+		handled++;
+	}
 
-		if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
-			spin_unlock_irq(&conf->device_lock);
-			md_check_recovery(mddev);
-			spin_lock_irq(&conf->device_lock);
-		}
+	if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
+		spin_unlock_irq(&conf->device_lock);
+		md_check_recovery(mddev);
+		spin_lock_irq(&conf->device_lock);
 	}
 
 	pr_debug("%d stripes handled\n", handled);
@@ -4791,6 +4840,7 @@ static void raid5_free_percpu(struct r5c
 		percpu = per_cpu_ptr(conf->percpu, cpu);
 		safe_put_page(percpu->spare_page);
 		kfree(percpu->scribble);
+		md_unregister_thread(&percpu->aux_thread);
 	}
 #ifdef CONFIG_HOTPLUG_CPU
 	unregister_cpu_notifier(&conf->cpu_notify);
@@ -4815,6 +4865,7 @@ static int raid456_cpu_notify(struct not
 {
 	struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
 	long cpu = (long)hcpu;
+	long anycpu;
 	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
 
 	switch (action) {
@@ -4824,8 +4875,18 @@ static int raid456_cpu_notify(struct not
 			percpu->spare_page = alloc_page(GFP_KERNEL);
 		if (!percpu->scribble)
 			percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
+		if (!percpu->aux_thread) {
+			char name[10];
+
+			snprintf(name, 10, "aux%ld", cpu);
+			percpu->aux_thread = md_register_thread(raid5auxd,
+				conf->mddev, name);
+			if (percpu->aux_thread)
+				percpu->aux_thread->thread_data = (void *)cpu;
+			INIT_LIST_HEAD(&(percpu->handle_list));
+		}
 
-		if (!percpu->scribble ||
+		if (!percpu->scribble || !percpu->aux_thread ||
 		    (conf->level == 6 && !percpu->spare_page)) {
 			safe_put_page(percpu->spare_page);
 			kfree(percpu->scribble);
@@ -4836,6 +4897,14 @@ static int raid456_cpu_notify(struct not
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
+		md_unregister_thread(&percpu->aux_thread);
+
+		spin_lock_irq(&conf->device_lock);
+		anycpu = cpumask_any(cpu_online_mask);
+		list_splice_tail_init(&percpu->handle_list,
+			&per_cpu_ptr(conf->percpu, anycpu)->handle_list);
+		spin_unlock_irq(&conf->device_lock);
+
 		safe_put_page(percpu->spare_page);
 		kfree(percpu->scribble);
 		percpu->spare_page = NULL;
@@ -4864,20 +4933,32 @@ static int raid5_alloc_percpu(struct r5c
 	get_online_cpus();
 	err = 0;
 	for_each_present_cpu(cpu) {
+		struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+		char name[10];
+
 		if (conf->level == 6) {
 			spare_page = alloc_page(GFP_KERNEL);
 			if (!spare_page) {
 				err = -ENOMEM;
 				break;
 			}
-			per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
+			percpu->spare_page = spare_page;
 		}
 		scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
 		if (!scribble) {
 			err = -ENOMEM;
 			break;
 		}
-		per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
+		percpu->scribble = scribble;
+		snprintf(name, 10, "aux%ld", cpu);
+		percpu->aux_thread = md_register_thread(raid5auxd, conf->mddev,
+			name);
+		if (!percpu->aux_thread) {
+			err = -ENOMEM;
+			break;
+		}
+		percpu->aux_thread->thread_data = (void *)cpu;
+		INIT_LIST_HEAD(&(percpu->handle_list));
 	}
 #ifdef CONFIG_HOTPLUG_CPU
 	conf->cpu_notify.notifier_call = raid456_cpu_notify;
@@ -4932,7 +5013,6 @@ static struct r5conf *setup_conf(struct
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
-	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h	2012-07-09 01:25:37.492848182 -0600
+++ linux/drivers/md/raid5.h	2012-07-09 01:27:43.202847084 -0600
@@ -211,6 +211,7 @@ struct stripe_head {
 	enum check_states	check_state;
 	enum reconstruct_states reconstruct_state;
 	spinlock_t		stripe_lock;
+	int			cpu;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -395,7 +396,6 @@ struct r5conf {
 					  * but is closest to zero.
 					  */
-	struct list_head	handle_list; /* stripes needing handling */
 	struct list_head	hold_list; /* preread ready stripes */
 	struct list_head	delayed_list; /* stripes that have plugged requests */
 	struct list_head	bitmap_list; /* stripes delaying awaiting bitmap update */
@@ -431,6 +431,8 @@ struct r5conf {
 					      * lists and performing address
 					      * conversions
 					      */
+	struct list_head	handle_list; /*stripes needing handling */
+	struct md_thread	*aux_thread;
 	} __percpu *percpu;
 	size_t			scribble_len; /* size of scribble region must be
 					       * associated with conf to handle
Index: linux/drivers/md/md.c
===================================================================
--- linux.orig/drivers/md/md.c	2012-07-09 01:25:37.502848182 -0600
+++ linux/drivers/md/md.c	2012-07-09 01:27:43.202847084 -0600
@@ -6715,7 +6715,7 @@ static int md_thread(void * arg)
 
 		clear_bit(THREAD_WAKEUP, &thread->flags);
 		if (!kthread_should_stop())
-			thread->run(thread->mddev);
+			thread->run(thread);
 	}
 
 	return 0;
@@ -6730,7 +6730,7 @@ void md_wakeup_thread(struct md_thread *
 	}
 }
 
-struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev,
+struct md_thread *md_register_thread(void (*run) (struct md_thread *), struct mddev *mddev,
 				 const char *name)
 {
 	struct md_thread *thread;
@@ -7280,8 +7280,9 @@ EXPORT_SYMBOL_GPL(md_allow_write);
 
 #define SYNC_MARKS	10
 #define	SYNC_MARK_STEP	(3*HZ)
-void md_do_sync(struct mddev *mddev)
+void md_do_sync(struct md_thread *thread)
 {
+	struct mddev *mddev = thread->mddev;
 	struct mddev *mddev2;
 	unsigned int currspeed = 0, window;
Index: linux/drivers/md/md.h
===================================================================
--- linux.orig/drivers/md/md.h	2012-07-09 01:25:37.482848182 -0600
+++ linux/drivers/md/md.h	2012-07-09 01:27:43.202847084 -0600
@@ -543,12 +543,13 @@ static inline void sysfs_unlink_rdev(str
 	list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
 
 struct md_thread {
-	void			(*run) (struct mddev *mddev);
+	void			(*run) (struct md_thread *thread);
 	struct mddev		*mddev;
 	wait_queue_head_t	wqueue;
 	unsigned long		flags;
 	struct task_struct	*tsk;
 	unsigned long		timeout;
+	void			*thread_data;
 };
 
 #define THREAD_WAKEUP 0
@@ -587,7 +588,7 @@ static inline void safe_put_page(struct
 extern int register_md_personality(struct md_personality *p);
 extern int unregister_md_personality(struct md_personality *p);
 extern struct md_thread *md_register_thread(
-	void (*run)(struct mddev *mddev),
+	void (*run)(struct md_thread *thread),
 	struct mddev *mddev,
 	const char *name);
 extern void md_unregister_thread(struct md_thread **threadp);
@@ -606,7 +607,7 @@ extern void md_super_write(struct mddev
 extern void md_super_wait(struct mddev *mddev);
 extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 			struct page *page, int rw, bool metadata_op);
-extern void md_do_sync(struct mddev *mddev);
+extern void md_do_sync(struct md_thread *thread);
 extern void md_new_event(struct mddev *mddev);
 extern int md_allow_write(struct mddev *mddev);
 extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
Index: linux/drivers/md/multipath.c
===================================================================
--- linux.orig/drivers/md/multipath.c	2012-07-09 01:25:37.532848182 -0600
+++ linux/drivers/md/multipath.c	2012-07-09 01:27:43.202847084 -0600
@@ -335,8 +335,9 @@ abort:
 *	3.	Performs writes following reads for array syncronising.
 */
 
-static void multipathd (struct mddev *mddev)
+static void multipathd (struct md_thread *thread)
 {
+	struct mddev *mddev = thread->mddev;
 	struct multipath_bh *mp_bh;
 	struct bio *bio;
 	unsigned long flags;
Index: linux/drivers/md/raid1.c
===================================================================
--- linux.orig/drivers/md/raid1.c	2012-07-09 01:25:37.512848182 -0600
+++ linux/drivers/md/raid1.c	2012-07-09 01:27:43.202847084 -0600
@@ -2157,8 +2157,9 @@ read_more:
 	}
 }
 
-static void raid1d(struct mddev *mddev)
+static void raid1d(struct md_thread *thread)
 {
+	struct mddev *mddev = thread->mddev;
 	struct r1bio *r1_bio;
 	unsigned long flags;
 	struct r1conf *conf = mddev->private;
Index: linux/drivers/md/raid10.c
===================================================================
--- linux.orig/drivers/md/raid10.c	2012-07-09 01:25:37.502848182 -0600
+++ linux/drivers/md/raid10.c	2012-07-09 01:27:43.202847084 -0600
@@ -2648,8 +2648,9 @@ static void handle_write_completed(struc
 	}
 }
 
-static void raid10d(struct mddev *mddev)
+static void raid10d(struct md_thread *thread)
 {
+	struct mddev *mddev = thread->mddev;
 	struct r10bio *r10_bio;
 	unsigned long flags;
 	struct r10conf *conf = mddev->private;
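
For anyone who wants the shape of the dispatch scheme without wading through
the diff, here is a minimal userspace sketch of the idea (pthreads, not the
kernel code; work_item, queue_work and NWORKERS are made-up names, and the
offline-cpu fallback is left out). One global mutex stands in for
conf->device_lock, each worker owns a list standing in for its per-cpu
handle_list, and a worker only pulls entries that were queued for it:

/*
 * Sketch only -- build with: cc -o sketch sketch.c -lpthread
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NWORKERS 4

struct work_item {			/* stands in for a stripe_head */
	int cpu;			/* like sh->cpu: submitter's id */
	int seq;
	struct work_item *next;
};

static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wakeup[NWORKERS];		/* like md_wakeup_thread() */
static struct work_item *handle_list[NWORKERS];	/* per-cpu handle_list */
static int done;

/* like handle_release_stripe(): put work on the submitter's own list */
static void queue_work(int cpu, int seq)
{
	struct work_item *w = malloc(sizeof(*w));

	w->cpu = cpu;
	w->seq = seq;
	pthread_mutex_lock(&device_lock);
	w->next = handle_list[cpu];
	handle_list[cpu] = w;
	pthread_cond_signal(&wakeup[cpu]);
	pthread_mutex_unlock(&device_lock);
}

/* like raid5auxd(): handle only this worker's list, under the global lock */
static void *worker(void *arg)
{
	int cpu = (int)(long)arg;

	pthread_mutex_lock(&device_lock);
	while (!done || handle_list[cpu]) {
		struct work_item *w = handle_list[cpu];

		if (!w) {
			pthread_cond_wait(&wakeup[cpu], &device_lock);
			continue;
		}
		handle_list[cpu] = w->next;
		/* the real code also drops the lock while handling a stripe */
		pthread_mutex_unlock(&device_lock);
		printf("worker %d handled item %d\n", cpu, w->seq);
		free(w);
		pthread_mutex_lock(&device_lock);
	}
	pthread_mutex_unlock(&device_lock);
	return NULL;
}

int main(void)
{
	pthread_t tid[NWORKERS];
	int i;

	for (i = 0; i < NWORKERS; i++) {
		pthread_cond_init(&wakeup[i], NULL);
		pthread_create(&tid[i], NULL, worker, (void *)(long)i);
	}
	for (i = 0; i < 16; i++)
		queue_work(i % NWORKERS, i);	/* "submitted from" cpu i%N */

	pthread_mutex_lock(&device_lock);
	done = 1;
	for (i = 0; i < NWORKERS; i++)
		pthread_cond_signal(&wakeup[i]);
	pthread_mutex_unlock(&device_lock);

	for (i = 0; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

The point is that only the lists and the threads are per-cpu; the lock stays
global, which is why device_lock didn't need to be split.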