The patch titled
     revert git-block
has been added to the -mm tree.  Its filename is
     revert-git-block.patch

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/SubmitChecklist when testing your code ***

See http://www.zip.com.au/~akpm/linux/patches/stuff/added-to-mm.txt to find
out what to do about this

The current -mm tree may be found at http://userweb.kernel.org/~akpm/mmotm/

------------------------------------------------------
Subject: revert git-block
From: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>

too much breakage for me.

Cc: Jens Axboe <jens.axboe@xxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 arch/Kconfig                               |    3 
 arch/alpha/Kconfig                         |    1 
 arch/alpha/kernel/core_marvel.c            |    6 
 arch/alpha/kernel/smp.c                    |  170 +
 arch/arm/Kconfig                           |    1 
 arch/arm/kernel/smp.c                      |  157 -
 arch/ia64/Kconfig                          |    1 
 arch/ia64/kernel/smp.c                     |  217 +
 arch/m32r/Kconfig                          |    1 
 arch/m32r/kernel/m32r_ksyms.c              |    3 
 arch/m32r/kernel/smp.c                     |  128 
 arch/m32r/kernel/traps.c                   |    3 
 arch/mips/Kconfig                          |    1 
 arch/mips/kernel/smp.c                     |  139 
 arch/mips/kernel/smtc.c                    |    1 
 arch/parisc/Kconfig                        |    1 
 arch/parisc/kernel/smp.c                   |  134 
 arch/powerpc/Kconfig                       |    1 
 arch/powerpc/kernel/smp.c                  |  220 +
 arch/powerpc/platforms/cell/interrupt.c    |    1 
 arch/powerpc/platforms/ps3/smp.c           |    7 
 arch/powerpc/platforms/pseries/xics.c      |    6 
 arch/powerpc/sysdev/mpic.c                 |    2 
 arch/sh/Kconfig                            |    1 
 arch/sh/kernel/smp.c                       |   48 
 arch/x86/Kconfig                           |    1 
 arch/x86/kernel/apic_32.c                  |    4 
 arch/x86/kernel/entry_64.S                 |    3 
 arch/x86/kernel/i8259_64.c                 |    4 
 arch/x86/kernel/smp.c                      |  152 -
 arch/x86/kernel/smpcommon.c                |   56 
 arch/x86/mach-voyager/voyager_smp.c        |   94 
 arch/x86/xen/enlighten.c                   |    4 
 arch/x86/xen/mmu.c                         |    2 
 arch/x86/xen/smp.c                         |  133 
 arch/x86/xen/xen-ops.h                     |    9 
 block/Kconfig.iosched                      |   12 
 block/Makefile                             |    4 
 block/as-iosched.c                         |    6 
 block/bfq-iosched.c                        | 2742 -------------------
 block/blk-core.c                           |  167 -
 block/blk-ioc.c                            |   23 
 block/blk-settings.c                       |   38 
 block/blk-softirq.c                        |  173 -
 block/blk-sysfs.c                          |   98 
 block/blk.h                                |   12 
 block/cfq-iosched.c                        |    2 
 fs/ioprio.c                                |    2 
 fs/splice.c                                |  341 --
 include/asm-alpha/smp.h                    |    2 
 include/asm-ia64/smp.h                     |    3 
 include/asm-m32r/smp.h                     |    1 
 include/asm-mips/smp.h                     |   10 
 include/asm-powerpc/smp.h                  |    5 
 include/asm-sh/smp.h                       |   12 
 include/asm-x86/hw_irq_32.h                |    1 
 include/asm-x86/hw_irq_64.h                |    2 
 include/asm-x86/mach-default/entry_arch.h  |    1 
 include/asm-x86/mach-default/irq_vectors.h |    1 
 include/asm-x86/mach-voyager/entry_arch.h  |    2 
 include/asm-x86/mach-voyager/irq_vectors.h |    4 
 include/asm-x86/smp.h                      |   19 
 include/asm-x86/xen/events.h               |    1 
 include/linux/bio.h                        |    1 
 include/linux/blkdev.h                     |   14 
 include/linux/elevator.h                   |    8 
 include/linux/iocontext.h                  |    4 
 include/linux/smp.h                        |   30 
 include/linux/workqueue.h                  |    2 
 init/main.c                                |    2 
 kernel/Makefile                            |    1 
 kernel/smp.c                               |  362 --
 kernel/workqueue.c                         |   31 
 73 files changed, 1797 insertions(+), 4057 deletions(-)

diff -puN arch/Kconfig~revert-git-block arch/Kconfig
--- a/arch/Kconfig~revert-git-block
+++ a/arch/Kconfig
@@ -39,6 +39,3 @@ config HAVE_KRETPROBES
 
 config HAVE_DMA_ATTRS
 	def_bool n
-
-config USE_GENERIC_SMP_HELPERS
-	def_bool n
diff -puN arch/alpha/Kconfig~revert-git-block arch/alpha/Kconfig
--- a/arch/alpha/Kconfig~revert-git-block
+++ a/arch/alpha/Kconfig
@@ -528,7 +528,6 @@ config ARCH_MAY_HAVE_PC_FDC
 
 config SMP
 	bool "Symmetric multi-processing support"
 	depends on ALPHA_SABLE || ALPHA_LYNX || ALPHA_RAWHIDE || ALPHA_DP264 || ALPHA_WILDFIRE ||
ALPHA_TITAN || ALPHA_GENERIC || ALPHA_SHARK || ALPHA_MARVEL - select USE_GENERIC_SMP_HELPERS ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff -puN arch/alpha/kernel/core_marvel.c~revert-git-block arch/alpha/kernel/core_marvel.c --- a/arch/alpha/kernel/core_marvel.c~revert-git-block +++ a/arch/alpha/kernel/core_marvel.c @@ -660,9 +660,9 @@ __marvel_rtc_io(u8 b, unsigned long addr #ifdef CONFIG_SMP if (smp_processor_id() != boot_cpuid) - smp_call_function_single(boot_cpuid, - __marvel_access_rtc, - &rtc_access, 1, 1); + smp_call_function_on_cpu(__marvel_access_rtc, + &rtc_access, 1, 1, + cpumask_of_cpu(boot_cpuid)); else __marvel_access_rtc(&rtc_access); #else diff -puN arch/alpha/kernel/smp.c~revert-git-block arch/alpha/kernel/smp.c --- a/arch/alpha/kernel/smp.c~revert-git-block +++ a/arch/alpha/kernel/smp.c @@ -62,7 +62,6 @@ static struct { enum ipi_message_type { IPI_RESCHEDULE, IPI_CALL_FUNC, - IPI_CALL_FUNC_SINGLE, IPI_CPU_STOP, }; @@ -559,6 +558,51 @@ send_ipi_message(cpumask_t to_whom, enum wripir(i); } +/* Structure and data for smp_call_function. This is designed to + minimize static memory requirements. Plus it looks cleaner. */ + +struct smp_call_struct { + void (*func) (void *info); + void *info; + long wait; + atomic_t unstarted_count; + atomic_t unfinished_count; +}; + +static struct smp_call_struct *smp_call_function_data; + +/* Atomicly drop data into a shared pointer. The pointer is free if + it is initially locked. If retry, spin until free. */ + +static int +pointer_lock (void *lock, void *data, int retry) +{ + void *old, *tmp; + + mb(); + again: + /* Compare and swap with zero. */ + asm volatile ( + "1: ldq_l %0,%1\n" + " mov %3,%2\n" + " bne %0,2f\n" + " stq_c %2,%1\n" + " beq %2,1b\n" + "2:" + : "=&r"(old), "=m"(*(void **)lock), "=&r"(tmp) + : "r"(data) + : "memory"); + + if (old == 0) + return 0; + if (! retry) + return -EBUSY; + + while (*(void **)lock) + barrier(); + goto again; +} + void handle_ipi(struct pt_regs *regs) { @@ -588,12 +632,31 @@ handle_ipi(struct pt_regs *regs) break; case IPI_CALL_FUNC: - generic_smp_call_function_interrupt(); - break; - - case IPI_CALL_FUNC_SINGLE: - generic_smp_call_function_single_interrupt(); + { + struct smp_call_struct *data; + void (*func)(void *info); + void *info; + int wait; + + data = smp_call_function_data; + func = data->func; + info = data->info; + wait = data->wait; + + /* Notify the sending CPU that the data has been + received, and execution is about to begin. */ + mb(); + atomic_dec (&data->unstarted_count); + + /* At this point the structure may be gone unless + wait is true. */ + (*func)(info); + + /* Notify the sending CPU that the task is done. */ + mb(); + if (wait) atomic_dec (&data->unfinished_count); break; + } case IPI_CPU_STOP: halt(); @@ -637,15 +700,102 @@ smp_send_stop(void) send_ipi_message(to_whom, IPI_CPU_STOP); } -void arch_send_call_function_ipi(cpumask_t mask) +/* + * Run a function on all other CPUs. + * <func> The function to run. This must be fast and non-blocking. + * <info> An arbitrary pointer to pass to the function. + * <retry> If true, keep retrying until ready. + * <wait> If true, wait until function has completed on other CPUs. + * [RETURNS] 0 on success, else a negative status code. + * + * Does not return until remote CPUs are nearly ready to execute <func> + * or are or have executed. 
+ * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ + +int +smp_call_function_on_cpu (void (*func) (void *info), void *info, int retry, + int wait, cpumask_t to_whom) { - send_ipi_message(mask, IPI_CALL_FUNC); + struct smp_call_struct data; + unsigned long timeout; + int num_cpus_to_call; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + data.wait = wait; + + cpu_clear(smp_processor_id(), to_whom); + num_cpus_to_call = cpus_weight(to_whom); + + atomic_set(&data.unstarted_count, num_cpus_to_call); + atomic_set(&data.unfinished_count, num_cpus_to_call); + + /* Acquire the smp_call_function_data mutex. */ + if (pointer_lock(&smp_call_function_data, &data, retry)) + return -EBUSY; + + /* Send a message to the requested CPUs. */ + send_ipi_message(to_whom, IPI_CALL_FUNC); + + /* Wait for a minimal response. */ + timeout = jiffies + HZ; + while (atomic_read (&data.unstarted_count) > 0 + && time_before (jiffies, timeout)) + barrier(); + + /* If there's no response yet, log a message but allow a longer + * timeout period -- if we get a response this time, log + * a message saying when we got it.. + */ + if (atomic_read(&data.unstarted_count) > 0) { + long start_time = jiffies; + printk(KERN_ERR "%s: initial timeout -- trying long wait\n", + __func__); + timeout = jiffies + 30 * HZ; + while (atomic_read(&data.unstarted_count) > 0 + && time_before(jiffies, timeout)) + barrier(); + if (atomic_read(&data.unstarted_count) <= 0) { + long delta = jiffies - start_time; + printk(KERN_ERR + "%s: response %ld.%ld seconds into long wait\n", + __func__, delta / HZ, + (100 * (delta - ((delta / HZ) * HZ))) / HZ); + } + } + + /* We either got one or timed out -- clear the lock. */ + mb(); + smp_call_function_data = NULL; + + /* + * If after both the initial and long timeout periods we still don't + * have a response, something is very wrong... + */ + BUG_ON(atomic_read (&data.unstarted_count) > 0); + + /* Wait for a complete response, if needed. */ + if (wait) { + while (atomic_read (&data.unfinished_count) > 0) + barrier(); + } + + return 0; } +EXPORT_SYMBOL(smp_call_function_on_cpu); -void arch_send_call_function_single_ipi(int cpu) +int +smp_call_function (void (*func) (void *info), void *info, int retry, int wait) { - send_ipi_message(cpumask_of_cpu(cpu), IPI_CALL_FUNC_SINGLE); + return smp_call_function_on_cpu (func, info, retry, wait, + cpu_online_map); } +EXPORT_SYMBOL(smp_call_function); static void ipi_imb(void *ignored) diff -puN arch/arm/Kconfig~revert-git-block arch/arm/Kconfig --- a/arch/arm/Kconfig~revert-git-block +++ a/arch/arm/Kconfig @@ -646,7 +646,6 @@ source "kernel/time/Kconfig" config SMP bool "Symmetric Multi-Processing (EXPERIMENTAL)" depends on EXPERIMENTAL && (REALVIEW_EB_ARM11MP || MACH_REALVIEW_PB11MP) - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. 
If diff -puN arch/arm/kernel/smp.c~revert-git-block arch/arm/kernel/smp.c --- a/arch/arm/kernel/smp.c~revert-git-block +++ a/arch/arm/kernel/smp.c @@ -68,10 +68,20 @@ enum ipi_msg_type { IPI_TIMER, IPI_RESCHEDULE, IPI_CALL_FUNC, - IPI_CALL_FUNC_SINGLE, IPI_CPU_STOP, }; +struct smp_call_struct { + void (*func)(void *info); + void *info; + int wait; + cpumask_t pending; + cpumask_t unfinished; +}; + +static struct smp_call_struct * volatile smp_call_function_data; +static DEFINE_SPINLOCK(smp_call_function_lock); + int __cpuinit __cpu_up(unsigned int cpu) { struct cpuinfo_arm *ci = &per_cpu(cpu_data, cpu); @@ -356,15 +366,114 @@ static void send_ipi_message(cpumask_t c local_irq_restore(flags); } -void arch_send_call_function_ipi(cpumask_t mask) +/* + * You must not call this function with disabled interrupts, from a + * hardware interrupt handler, nor from a bottom half handler. + */ +static int smp_call_function_on_cpu(void (*func)(void *info), void *info, + int retry, int wait, cpumask_t callmap) +{ + struct smp_call_struct data; + unsigned long timeout; + int ret = 0; + + data.func = func; + data.info = info; + data.wait = wait; + + cpu_clear(smp_processor_id(), callmap); + if (cpus_empty(callmap)) + goto out; + + data.pending = callmap; + if (wait) + data.unfinished = callmap; + + /* + * try to get the mutex on smp_call_function_data + */ + spin_lock(&smp_call_function_lock); + smp_call_function_data = &data; + + send_ipi_message(callmap, IPI_CALL_FUNC); + + timeout = jiffies + HZ; + while (!cpus_empty(data.pending) && time_before(jiffies, timeout)) + barrier(); + + /* + * did we time out? + */ + if (!cpus_empty(data.pending)) { + /* + * this may be causing our panic - report it + */ + printk(KERN_CRIT + "CPU%u: smp_call_function timeout for %p(%p)\n" + " callmap %lx pending %lx, %swait\n", + smp_processor_id(), func, info, *cpus_addr(callmap), + *cpus_addr(data.pending), wait ? 
"" : "no "); + + /* + * TRACE + */ + timeout = jiffies + (5 * HZ); + while (!cpus_empty(data.pending) && time_before(jiffies, timeout)) + barrier(); + + if (cpus_empty(data.pending)) + printk(KERN_CRIT " RESOLVED\n"); + else + printk(KERN_CRIT " STILL STUCK\n"); + } + + /* + * whatever happened, we're done with the data, so release it + */ + smp_call_function_data = NULL; + spin_unlock(&smp_call_function_lock); + + if (!cpus_empty(data.pending)) { + ret = -ETIMEDOUT; + goto out; + } + + if (wait) + while (!cpus_empty(data.unfinished)) + barrier(); + out: + + return 0; +} + +int smp_call_function(void (*func)(void *info), void *info, int retry, + int wait) { - send_ipi_message(mask, IPI_CALL_FUNC); + return smp_call_function_on_cpu(func, info, retry, wait, + cpu_online_map); } +EXPORT_SYMBOL_GPL(smp_call_function); -void arch_send_call_function_single_ipi(int cpu) +int smp_call_function_single(int cpu, void (*func)(void *info), void *info, + int retry, int wait) { - send_ipi_message(cpumask_of_cpu(cpu), IPI_CALL_FUNC_SINGLE); + /* prevent preemption and reschedule on another processor */ + int current_cpu = get_cpu(); + int ret = 0; + + if (cpu == current_cpu) { + local_irq_disable(); + func(info); + local_irq_enable(); + } else + ret = smp_call_function_on_cpu(func, info, retry, wait, + cpumask_of_cpu(cpu)); + + put_cpu(); + + return ret; } +EXPORT_SYMBOL_GPL(smp_call_function_single); void show_ipi_list(struct seq_file *p) { @@ -412,6 +521,27 @@ asmlinkage void __exception do_local_tim } #endif +/* + * ipi_call_function - handle IPI from smp_call_function() + * + * Note that we copy data out of the cross-call structure and then + * let the caller know that we're here and have done with their data + */ +static void ipi_call_function(unsigned int cpu) +{ + struct smp_call_struct *data = smp_call_function_data; + void (*func)(void *info) = data->func; + void *info = data->info; + int wait = data->wait; + + cpu_clear(cpu, data->pending); + + func(info); + + if (wait) + cpu_clear(cpu, data->unfinished); +} + static DEFINE_SPINLOCK(stop_lock); /* @@ -481,11 +611,7 @@ asmlinkage void __exception do_IPI(struc break; case IPI_CALL_FUNC: - generic_smp_call_function_interrupt(); - break; - - case IPI_CALL_FUNC_SINGLE: - generic_smp_call_function_single_interrupt(); + ipi_call_function(cpu); break; case IPI_CPU_STOP: @@ -536,13 +662,14 @@ int setup_profiling_timer(unsigned int m } static int -on_each_cpu_mask(void (*func)(void *), void *info, int wait, cpumask_t mask) +on_each_cpu_mask(void (*func)(void *), void *info, int retry, int wait, + cpumask_t mask) { int ret = 0; preempt_disable(); - ret = smp_call_function_mask(mask, func, info, wait); + ret = smp_call_function_on_cpu(func, info, retry, wait, mask); if (cpu_isset(smp_processor_id(), mask)) func(info); @@ -611,7 +738,7 @@ void flush_tlb_mm(struct mm_struct *mm) { cpumask_t mask = mm->cpu_vm_mask; - on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, mask); + on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, 1, mask); } void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) @@ -622,7 +749,7 @@ void flush_tlb_page(struct vm_area_struc ta.ta_vma = vma; ta.ta_start = uaddr; - on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, mask); + on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, 1, mask); } void flush_tlb_kernel_page(unsigned long kaddr) @@ -644,7 +771,7 @@ void flush_tlb_range(struct vm_area_stru ta.ta_start = start; ta.ta_end = end; - on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, mask); + on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, 1, mask); } void 
flush_tlb_kernel_range(unsigned long start, unsigned long end) diff -puN arch/ia64/Kconfig~revert-git-block arch/ia64/Kconfig --- a/arch/ia64/Kconfig~revert-git-block +++ a/arch/ia64/Kconfig @@ -290,7 +290,6 @@ config VIRT_CPU_ACCOUNTING config SMP bool "Symmetric multi-processing support" - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one CPU. If you have a system with only one CPU, say N. If you have a system with more diff -puN arch/ia64/kernel/smp.c~revert-git-block arch/ia64/kernel/smp.c --- a/arch/ia64/kernel/smp.c~revert-git-block +++ a/arch/ia64/kernel/smp.c @@ -60,9 +60,25 @@ static struct local_tlb_flush_counts { static DEFINE_PER_CPU(unsigned int, shadow_flush_counts[NR_CPUS]) ____cacheline_aligned; + +/* + * Structure and data for smp_call_function(). This is designed to minimise static memory + * requirements. It also looks cleaner. + */ +static __cacheline_aligned DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + long wait; + atomic_t started; + atomic_t finished; +}; + +static volatile struct call_data_struct *call_data; + #define IPI_CALL_FUNC 0 #define IPI_CPU_STOP 1 -#define IPI_CALL_FUNC_SINGLE 2 #define IPI_KDUMP_CPU_STOP 3 /* This needs to be cacheline aligned because it is written to by *other* CPUs. */ @@ -73,13 +89,13 @@ extern void cpu_halt (void); void lock_ipi_calllock(void) { - spin_lock_irq(&call_function_lock); + spin_lock_irq(&call_lock); } void unlock_ipi_calllock(void) { - spin_unlock_irq(&call_function_lock); + spin_unlock_irq(&call_lock); } static inline void @@ -147,14 +163,12 @@ handle_IPI (int irq, void *dev_id) ops &= ~(1 << which); switch (which) { - case IPI_CPU_STOP: - stop_this_cpu(); - break; case IPI_CALL_FUNC: - generic_smp_call_function_interrupt(); + handle_call_data(); break; - case IPI_CALL_FUNC_SINGLE: - generic_smp_call_function_single_interrupt(); + + case IPI_CPU_STOP: + stop_this_cpu(); break; #ifdef CONFIG_KEXEC case IPI_KDUMP_CPU_STOP: @@ -173,8 +187,6 @@ handle_IPI (int irq, void *dev_id) return IRQ_HANDLED; } - - /* * Called with preemption disabled. */ @@ -348,15 +360,190 @@ smp_flush_tlb_mm (struct mm_struct *mm) on_each_cpu((void (*)(void *))local_finish_flush_tlb_mm, mm, 1, 1); } -void arch_send_call_function_single_ipi(int cpu) +/* + * Run a function on a specific CPU + * <func> The function to run. This must be fast and non-blocking. + * <info> An arbitrary pointer to pass to the function. + * <nonatomic> Currently unused. + * <wait> If true, wait until function has completed on other CPUs. + * [RETURNS] 0 on success, else a negative status code. + * + * Does not return until the remote CPU is nearly ready to execute <func> + * or is or has executed. 
+ */ + +int +smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int nonatomic, + int wait) { - send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE); + struct call_data_struct data; + int cpus = 1; + int me = get_cpu(); /* prevent preemption and reschedule on another processor */ + + if (cpuid == me) { + local_irq_disable(); + func(info); + local_irq_enable(); + put_cpu(); + return 0; + } + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + spin_lock_bh(&call_lock); + + call_data = &data; + mb(); /* ensure store to call_data precedes setting of IPI_CALL_FUNC */ + send_IPI_single(cpuid, IPI_CALL_FUNC); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); + call_data = NULL; + + spin_unlock_bh(&call_lock); + put_cpu(); + return 0; } +EXPORT_SYMBOL(smp_call_function_single); -void arch_send_call_function_ipi(cpumask_t mask) +/** + * smp_call_function_mask(): Run a function on a set of other CPUs. + * <mask> The set of cpus to run on. Must not include the current cpu. + * <func> The function to run. This must be fast and non-blocking. + * <info> An arbitrary pointer to pass to the function. + * <wait> If true, wait (atomically) until function + * has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) +{ + struct call_data_struct data; + cpumask_t allbutself; + int cpus; + + spin_lock(&call_lock); + allbutself = cpu_online_map; + cpu_clear(smp_processor_id(), allbutself); + + cpus_and(mask, mask, allbutself); + cpus = cpus_weight(mask); + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); /* ensure store to call_data precedes setting of IPI_CALL_FUNC*/ + + /* Send a message to other CPUs */ + if (cpus_equal(mask, allbutself)) + send_IPI_allbutself(IPI_CALL_FUNC); + else + send_IPI_mask(mask, IPI_CALL_FUNC); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); + call_data = NULL; + + spin_unlock(&call_lock); + return 0; + +} +EXPORT_SYMBOL(smp_call_function_mask); + +/* + * this function sends a 'generic call function' IPI to all other CPUs + * in the system. + */ + +/* + * [SUMMARY] Run a function on all other CPUs. + * <func> The function to run. This must be fast and non-blocking. + * <info> An arbitrary pointer to pass to the function. + * <nonatomic> currently unused. + * <wait> If true, wait (atomically) until function has completed on other CPUs. + * [RETURNS] 0 on success, else a negative status code. + * + * Does not return until remote CPUs are nearly ready to execute <func> or are or have + * executed. 
+ * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int +smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wait) { - send_IPI_mask(mask, IPI_CALL_FUNC); + struct call_data_struct data; + int cpus; + + spin_lock(&call_lock); + cpus = num_online_cpus() - 1; + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); /* ensure store to call_data precedes setting of IPI_CALL_FUNC */ + send_IPI_allbutself(IPI_CALL_FUNC); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); + call_data = NULL; + + spin_unlock(&call_lock); + return 0; } +EXPORT_SYMBOL(smp_call_function); /* * this function calls the 'stop' function on all other CPUs in the system. diff -puN arch/m32r/Kconfig~revert-git-block arch/m32r/Kconfig --- a/arch/m32r/Kconfig~revert-git-block +++ a/arch/m32r/Kconfig @@ -296,7 +296,6 @@ config PREEMPT config SMP bool "Symmetric multi-processing support" - select USE_GENERIC_SMP_HELPERS ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff -puN arch/m32r/kernel/m32r_ksyms.c~revert-git-block arch/m32r/kernel/m32r_ksyms.c --- a/arch/m32r/kernel/m32r_ksyms.c~revert-git-block +++ a/arch/m32r/kernel/m32r_ksyms.c @@ -43,6 +43,9 @@ EXPORT_SYMBOL(dcache_dummy); #endif EXPORT_SYMBOL(cpu_data); +/* Global SMP stuff */ +EXPORT_SYMBOL(smp_call_function); + /* TLB flushing */ EXPORT_SYMBOL(smp_flush_tlb_page); #endif diff -puN arch/m32r/kernel/smp.c~revert-git-block arch/m32r/kernel/smp.c --- a/arch/m32r/kernel/smp.c~revert-git-block +++ a/arch/m32r/kernel/smp.c @@ -35,6 +35,22 @@ /*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*/ /* + * Structure and data for smp_call_function(). This is designed to minimise + * static memory requirements. It also looks cleaner. 
+ */ +static DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + atomic_t started; + atomic_t finished; + int wait; +} __attribute__ ((__aligned__(SMP_CACHE_BYTES))); + +static struct call_data_struct *call_data; + +/* * For flush_cache_all() */ static DEFINE_SPINLOCK(flushcache_lock); @@ -80,6 +96,9 @@ void smp_invalidate_interrupt(void); void smp_send_stop(void); static void stop_this_cpu(void *); +int smp_call_function(void (*) (void *), void *, int, int); +void smp_call_function_interrupt(void); + void smp_send_timer(void); void smp_ipi_timer_interrupt(struct pt_regs *); void smp_local_timer_interrupt(void); @@ -546,14 +565,86 @@ static void stop_this_cpu(void *dummy) for ( ; ; ); } -void arch_send_call_function_ipi(cpumask_t mask) -{ - send_IPI_mask(mask, CALL_FUNCTION_IPI, 0); -} +/*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*/ +/* Call function Routines */ +/*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*/ -void arch_send_call_function_single_ipi(int cpu) +/*==========================================================================* + * Name: smp_call_function + * + * Description: This routine sends a 'CALL_FUNCTION_IPI' to all other CPUs + * in the system. + * + * Born on Date: 2002.02.05 + * + * Arguments: *func - The function to run. This must be fast and + * non-blocking. + * *info - An arbitrary pointer to pass to the function. + * nonatomic - currently unused. + * wait - If true, wait (atomically) until function has + * completed on other CPUs. + * + * Returns: 0 on success, else a negative status code. Does not return + * until remote CPUs are nearly ready to execute <<func>> or + * are or have executed. + * + * Cautions: You must not call this function with disabled interrupts or + * from a hardware interrupt handler, you may call it from a + * bottom half handler. + * + * Modification log: + * Date Who Description + * ---------- --- -------------------------------------------------------- + * + *==========================================================================*/ +int smp_call_function(void (*func) (void *info), void *info, int nonatomic, + int wait) { - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_IPI, 0); + struct call_data_struct data; + int cpus; + +#ifdef DEBUG_SMP + unsigned long flags; + __save_flags(flags); + if (!(flags & 0x0040)) /* Interrupt Disable NONONO */ + BUG(); +#endif /* DEBUG_SMP */ + + /* Holding any lock stops cpus from going down. 
*/ + spin_lock(&call_lock); + cpus = num_online_cpus() - 1; + + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(CALL_FUNCTION_IPI, 0); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + barrier(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + barrier(); + spin_unlock(&call_lock); + + return 0; } /*==========================================================================* @@ -575,16 +666,27 @@ void arch_send_call_function_single_ipi( *==========================================================================*/ void smp_call_function_interrupt(void) { - irq_enter(); - generic_smp_call_function_interrupt(); - irq_exit(); -} + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + int wait = call_data->wait; -void smp_call_function_single_interrupt(void) -{ + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&call_data->started); + /* + * At this point the info structure may be out of scope unless wait==1 + */ irq_enter(); - generic_smp_call_function_single_interrupt(); + (*func)(info); irq_exit(); + + if (wait) { + mb(); + atomic_inc(&call_data->finished); + } } /*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*/ diff -puN arch/m32r/kernel/traps.c~revert-git-block arch/m32r/kernel/traps.c --- a/arch/m32r/kernel/traps.c~revert-git-block +++ a/arch/m32r/kernel/traps.c @@ -40,7 +40,6 @@ extern void smp_invalidate_interrupt(voi extern void smp_call_function_interrupt(void); extern void smp_ipi_timer_interrupt(void); extern void smp_flush_cache_all_interrupt(void); -extern void smp_call_function_single_interrupt(void); /* * for Boot AP function @@ -104,7 +103,7 @@ void set_eit_vector_entries(void) eit_vector[186] = (unsigned long)smp_call_function_interrupt; eit_vector[187] = (unsigned long)smp_ipi_timer_interrupt; eit_vector[188] = (unsigned long)smp_flush_cache_all_interrupt; - eit_vector[189] = (unsigned long)smp_call_function_single_interrupt; + eit_vector[189] = 0; eit_vector[190] = 0; eit_vector[191] = 0; #endif diff -puN arch/mips/Kconfig~revert-git-block arch/mips/Kconfig --- a/arch/mips/Kconfig~revert-git-block +++ a/arch/mips/Kconfig @@ -1763,7 +1763,6 @@ config SMP bool "Multi-Processing support" depends on SYS_SUPPORTS_SMP select IRQ_PER_CPU - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff -puN arch/mips/kernel/smp.c~revert-git-block arch/mips/kernel/smp.c --- a/arch/mips/kernel/smp.c~revert-git-block +++ a/arch/mips/kernel/smp.c @@ -131,28 +131,145 @@ asmlinkage __cpuinit void start_secondar cpu_idle(); } -void arch_send_call_function_ipi(cpumask_t mask) +DEFINE_SPINLOCK(smp_call_lock); + +struct call_data_struct *call_data; + +/* + * Run a function on all other CPUs. + * + * <mask> cpuset_t of all processors to run the function on. + * <func> The function to run. This must be fast and non-blocking. + * <info> An arbitrary pointer to pass to the function. + * <retry> If true, keep retrying until ready. 
+ * <wait> If true, wait until function has completed on other CPUs. + * [RETURNS] 0 on success, else a negative status code. + * + * Does not return until remote CPUs are nearly ready to execute <func> + * or are or have executed. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler: + * + * CPU A CPU B + * Disable interrupts + * smp_call_function() + * Take call_lock + * Send IPIs + * Wait for all cpus to acknowledge IPI + * CPU A has not responded, spin waiting + * for cpu A to respond, holding call_lock + * smp_call_function() + * Spin waiting for call_lock + * Deadlock Deadlock + */ +int smp_call_function_mask(cpumask_t mask, void (*func) (void *info), + void *info, int retry, int wait) { + struct call_data_struct data; + int cpu = smp_processor_id(); + int cpus; + + /* + * Can die spectacularly if this CPU isn't yet marked online + */ + BUG_ON(!cpu_online(cpu)); + + cpu_clear(cpu, mask); + cpus = cpus_weight(mask); + if (!cpus) + return 0; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + spin_lock(&smp_call_lock); + call_data = &data; + smp_mb(); + + /* Send a message to all other CPUs and wait for them to respond */ mp_ops->send_ipi_mask(mask, SMP_CALL_FUNCTION); + + /* Wait for response */ + /* FIXME: lock-up detection, backtrace on lock-up */ + while (atomic_read(&data.started) != cpus) + barrier(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + barrier(); + call_data = NULL; + spin_unlock(&smp_call_lock); + + return 0; } -/* - * We reuse the same vector for the single IPI - */ -void arch_send_call_function_single_ipi(int cpu) +int smp_call_function(void (*func) (void *info), void *info, int retry, + int wait) { - mp_ops->send_ipi_mask(cpumask_of_cpu(cpu), SMP_CALL_FUNCTION); + return smp_call_function_mask(cpu_online_map, func, info, retry, wait); } -/* - * Call into both interrupt handlers, as we share the IPI for them - */ void smp_call_function_interrupt(void) { + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + int wait = call_data->wait; + + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function. + */ + smp_mb(); + atomic_inc(&call_data->started); + + /* + * At this point the info structure may be out of scope unless wait==1. 
+ */ irq_enter(); - generic_smp_call_function_single_interrupt(); - generic_smp_call_function_interrupt(); + (*func)(info); irq_exit(); + + if (wait) { + smp_mb(); + atomic_inc(&call_data->finished); + } +} + +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int retry, int wait) +{ + int ret, me; + + /* + * Can die spectacularly if this CPU isn't yet marked online + */ + if (!cpu_online(cpu)) + return 0; + + me = get_cpu(); + BUG_ON(!cpu_online(me)); + + if (cpu == me) { + local_irq_disable(); + func(info); + local_irq_enable(); + put_cpu(); + return 0; + } + + ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, retry, + wait); + + put_cpu(); + return 0; } static void stop_this_cpu(void *dummy) diff -puN arch/mips/kernel/smtc.c~revert-git-block arch/mips/kernel/smtc.c --- a/arch/mips/kernel/smtc.c~revert-git-block +++ a/arch/mips/kernel/smtc.c @@ -877,6 +877,7 @@ static void ipi_resched_interrupt(void) /* Return from interrupt should be enough to cause scheduler check */ } + static void ipi_call_interrupt(void) { /* Invoke generic function invocation code in smp.c */ diff -puN arch/parisc/Kconfig~revert-git-block arch/parisc/Kconfig --- a/arch/parisc/Kconfig~revert-git-block +++ a/arch/parisc/Kconfig @@ -199,7 +199,6 @@ endchoice config SMP bool "Symmetric multi-processing support" - select USE_GENERIC_SMP_HELPERS ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff -puN arch/parisc/kernel/smp.c~revert-git-block arch/parisc/kernel/smp.c --- a/arch/parisc/kernel/smp.c~revert-git-block +++ a/arch/parisc/kernel/smp.c @@ -84,11 +84,19 @@ EXPORT_SYMBOL(cpu_possible_map); DEFINE_PER_CPU(spinlock_t, ipi_lock) = SPIN_LOCK_UNLOCKED; +struct smp_call_struct { + void (*func) (void *info); + void *info; + long wait; + atomic_t unstarted_count; + atomic_t unfinished_count; +}; +static volatile struct smp_call_struct *smp_call_function_data; + enum ipi_message_type { IPI_NOP=0, IPI_RESCHEDULE=1, IPI_CALL_FUNC, - IPI_CALL_FUNC_SINGLE, IPI_CPU_START, IPI_CPU_STOP, IPI_CPU_TEST @@ -179,12 +187,33 @@ ipi_interrupt(int irq, void *dev_id) case IPI_CALL_FUNC: smp_debug(100, KERN_DEBUG "CPU%d IPI_CALL_FUNC\n", this_cpu); - generic_smp_call_function_interrupt(); - break; - - case IPI_CALL_FUNC_SINGLE: - smp_debug(100, KERN_DEBUG "CPU%d IPI_CALL_FUNC_SINGLE\n", this_cpu); - generic_smp_call_function_single_interrupt(); + { + volatile struct smp_call_struct *data; + void (*func)(void *info); + void *info; + int wait; + + data = smp_call_function_data; + func = data->func; + info = data->info; + wait = data->wait; + + mb(); + atomic_dec ((atomic_t *)&data->unstarted_count); + + /* At this point, *data can't + * be relied upon. + */ + + (*func)(info); + + /* Notify the sending CPU that the + * task is done. 
+ */ + mb(); + if (wait) + atomic_dec ((atomic_t *)&data->unfinished_count); + } break; case IPI_CPU_START: @@ -227,14 +256,6 @@ ipi_send(int cpu, enum ipi_message_type spin_unlock_irqrestore(lock, flags); } -static void -send_IPI_mask(cpumask_t mask, enum ipi_message_type op) -{ - int cpu; - - for_each_cpu_mask(cpu, mask) - ipi_send(cpu, op); -} static inline void send_IPI_single(int dest_cpu, enum ipi_message_type op) @@ -274,16 +295,87 @@ smp_send_all_nop(void) send_IPI_allbutself(IPI_NOP); } -void arch_send_call_function_ipi(cpumask_t mask) -{ - send_IPI_mask(mask, IPI_CALL_FUNC); -} -void arch_send_call_function_single_ipi(int cpu) +/** + * Run a function on all other CPUs. + * <func> The function to run. This must be fast and non-blocking. + * <info> An arbitrary pointer to pass to the function. + * <retry> If true, keep retrying until ready. + * <wait> If true, wait until function has completed on other CPUs. + * [RETURNS] 0 on success, else a negative status code. + * + * Does not return until remote CPUs are nearly ready to execute <func> + * or have executed. + */ + +int +smp_call_function (void (*func) (void *info), void *info, int retry, int wait) { - send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE); + struct smp_call_struct data; + unsigned long timeout; + static DEFINE_SPINLOCK(lock); + int retries = 0; + + if (num_online_cpus() < 2) + return 0; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + /* can also deadlock if IPIs are disabled */ + WARN_ON((get_eiem() & (1UL<<(CPU_IRQ_MAX - IPI_IRQ))) == 0); + + + data.func = func; + data.info = info; + data.wait = wait; + atomic_set(&data.unstarted_count, num_online_cpus() - 1); + atomic_set(&data.unfinished_count, num_online_cpus() - 1); + + if (retry) { + spin_lock (&lock); + while (smp_call_function_data != 0) + barrier(); + } + else { + spin_lock (&lock); + if (smp_call_function_data) { + spin_unlock (&lock); + return -EBUSY; + } + } + + smp_call_function_data = &data; + spin_unlock (&lock); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(IPI_CALL_FUNC); + + retry: + /* Wait for response */ + timeout = jiffies + HZ; + while ( (atomic_read (&data.unstarted_count) > 0) && + time_before (jiffies, timeout) ) + barrier (); + + if (atomic_read (&data.unstarted_count) > 0) { + printk(KERN_CRIT "SMP CALL FUNCTION TIMED OUT! (cpu=%d), try %d\n", + smp_processor_id(), ++retries); + goto retry; + } + /* We either got one or timed out. Release the lock */ + + mb(); + smp_call_function_data = NULL; + + while (wait && atomic_read (&data.unfinished_count) > 0) + barrier (); + + return 0; } +EXPORT_SYMBOL(smp_call_function); + /* * Flush all other CPU's tlb and then mine. Do this with on_each_cpu() * as we want to ensure all TLB's flushed before proceeding. 
diff -puN arch/powerpc/Kconfig~revert-git-block arch/powerpc/Kconfig --- a/arch/powerpc/Kconfig~revert-git-block +++ a/arch/powerpc/Kconfig @@ -110,7 +110,6 @@ config PPC select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_LMB - select USE_GENERIC_SMP_HELPERS if SMP config EARLY_PRINTK bool diff -puN arch/powerpc/kernel/smp.c~revert-git-block arch/powerpc/kernel/smp.c --- a/arch/powerpc/kernel/smp.c~revert-git-block +++ a/arch/powerpc/kernel/smp.c @@ -72,8 +72,12 @@ struct smp_ops_t *smp_ops; static volatile unsigned int cpu_callin_map[NR_CPUS]; +void smp_call_function_interrupt(void); + int smt_enabled_at_boot = 1; +static int ipi_fail_ok; + static void (*crash_ipi_function_ptr)(struct pt_regs *) = NULL; #ifdef CONFIG_PPC64 @@ -95,15 +99,12 @@ void smp_message_recv(int msg) { switch(msg) { case PPC_MSG_CALL_FUNCTION: - generic_smp_call_function_interrupt(); + smp_call_function_interrupt(); break; case PPC_MSG_RESCHEDULE: /* XXX Do we have to do this? */ set_need_resched(); break; - case PPC_MSG_CALL_FUNC_SINGLE: - generic_smp_call_function_single_interrupt(); - break; case PPC_MSG_DEBUGGER_BREAK: if (crash_ipi_function_ptr) { crash_ipi_function_ptr(get_irq_regs()); @@ -153,22 +154,215 @@ static void stop_this_cpu(void *dummy) ; } -void arch_send_call_function_single_ipi(int cpu) +/* + * Structure and data for smp_call_function(). This is designed to minimise + * static memory requirements. It also looks cleaner. + * Stolen from the i386 version. + */ +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock); + +static struct call_data_struct { + void (*func) (void *info); + void *info; + atomic_t started; + atomic_t finished; + int wait; +} *call_data; + +/* delay of at least 8 seconds */ +#define SMP_CALL_TIMEOUT 8 + +/* + * These functions send a 'generic call function' IPI to other online + * CPUS in the system. + * + * [SUMMARY] Run a function on other CPUs. + * <func> The function to run. This must be fast and non-blocking. + * <info> An arbitrary pointer to pass to the function. + * <nonatomic> currently unused. + * <wait> If true, wait (atomically) until function has completed on other CPUs. + * [RETURNS] 0 on success, else a negative status code. Does not return until + * remote CPUs are nearly ready to execute <<func>> or are or have executed. + * <map> is a cpu map of the cpus to send IPI to. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +static int __smp_call_function_map(void (*func) (void *info), void *info, + int nonatomic, int wait, cpumask_t map) +{ + struct call_data_struct data; + int ret = -1, num_cpus; + int cpu; + u64 timeout; + + if (unlikely(smp_ops == NULL)) + return ret; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + /* remove 'self' from the map */ + if (cpu_isset(smp_processor_id(), map)) + cpu_clear(smp_processor_id(), map); + + /* sanity check the map, remove any non-online processors. 
*/ + cpus_and(map, map, cpu_online_map); + + num_cpus = cpus_weight(map); + if (!num_cpus) + goto done; + + call_data = &data; + smp_wmb(); + /* Send a message to all CPUs in the map */ + for_each_cpu_mask(cpu, map) + smp_ops->message_pass(cpu, PPC_MSG_CALL_FUNCTION); + + timeout = get_tb() + (u64) SMP_CALL_TIMEOUT * tb_ticks_per_sec; + + /* Wait for indication that they have received the message */ + while (atomic_read(&data.started) != num_cpus) { + HMT_low(); + if (get_tb() >= timeout) { + printk("smp_call_function on cpu %d: other cpus not " + "responding (%d)\n", smp_processor_id(), + atomic_read(&data.started)); + if (!ipi_fail_ok) + debugger(NULL); + goto out; + } + } + + /* optionally wait for the CPUs to complete */ + if (wait) { + while (atomic_read(&data.finished) != num_cpus) { + HMT_low(); + if (get_tb() >= timeout) { + printk("smp_call_function on cpu %d: other " + "cpus not finishing (%d/%d)\n", + smp_processor_id(), + atomic_read(&data.finished), + atomic_read(&data.started)); + debugger(NULL); + goto out; + } + } + } + + done: + ret = 0; + + out: + call_data = NULL; + HMT_medium(); + return ret; +} + +static int __smp_call_function(void (*func)(void *info), void *info, + int nonatomic, int wait) { - smp_ops->message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE); + int ret; + spin_lock(&call_lock); + ret =__smp_call_function_map(func, info, nonatomic, wait, + cpu_online_map); + spin_unlock(&call_lock); + return ret; } -void arch_send_call_function_ipi(cpumask_t mask) +int smp_call_function(void (*func) (void *info), void *info, int nonatomic, + int wait) { - unsigned int cpu; + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); - for_each_cpu_mask(cpu, mask) - smp_ops->message_pass(cpu, PPC_MSG_CALL_FUNCTION); + return __smp_call_function(func, info, nonatomic, wait); } +EXPORT_SYMBOL(smp_call_function); + +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + cpumask_t map = CPU_MASK_NONE; + int ret = 0; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + if (!cpu_online(cpu)) + return -EINVAL; + + cpu_set(cpu, map); + if (cpu != get_cpu()) { + spin_lock(&call_lock); + ret = __smp_call_function_map(func, info, nonatomic, wait, map); + spin_unlock(&call_lock); + } else { + local_irq_disable(); + func(info); + local_irq_enable(); + } + put_cpu(); + return ret; +} +EXPORT_SYMBOL(smp_call_function_single); void smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 0, 0); + int nolock; + + /* It's OK to fail sending the IPI, since the alternative is to + * be stuck forever waiting on the other CPU to take the interrupt. + * + * It's better to at least continue and go through reboot, since this + * function is usually called at panic or reboot time in the first + * place. + */ + ipi_fail_ok = 1; + + /* Don't deadlock in case we got called through panic */ + nolock = !spin_trylock(&call_lock); + __smp_call_function_map(stop_this_cpu, NULL, 1, 0, cpu_online_map); + if (!nolock) + spin_unlock(&call_lock); +} + +void smp_call_function_interrupt(void) +{ + void (*func) (void *info); + void *info; + int wait; + + /* call_data will be NULL if the sender timed out while + * waiting on us to receive the call. 
+ */ + if (!call_data) + return; + + func = call_data->func; + info = call_data->info; + wait = call_data->wait; + + if (!wait) + smp_mb__before_atomic_inc(); + + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + atomic_inc(&call_data->started); + /* + * At this point the info structure may be out of scope unless wait==1 + */ + (*func)(info); + if (wait) { + smp_mb__before_atomic_inc(); + atomic_inc(&call_data->finished); + } } extern struct gettimeofday_struct do_gtod; @@ -402,9 +596,9 @@ int __devinit start_secondary(void *unus secondary_cpu_time_init(); - spin_lock(&call_function_lock); + spin_lock(&call_lock); cpu_set(cpu, cpu_online_map); - spin_unlock(&call_function_lock); + spin_unlock(&call_lock); local_irq_enable(); diff -puN arch/powerpc/platforms/cell/interrupt.c~revert-git-block arch/powerpc/platforms/cell/interrupt.c --- a/arch/powerpc/platforms/cell/interrupt.c~revert-git-block +++ a/arch/powerpc/platforms/cell/interrupt.c @@ -218,7 +218,6 @@ void iic_request_IPIs(void) { iic_request_ipi(PPC_MSG_CALL_FUNCTION, "IPI-call"); iic_request_ipi(PPC_MSG_RESCHEDULE, "IPI-resched"); - iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE, "IPI-call-single"); #ifdef CONFIG_DEBUGGER iic_request_ipi(PPC_MSG_DEBUGGER_BREAK, "IPI-debug"); #endif /* CONFIG_DEBUGGER */ diff -puN arch/powerpc/platforms/ps3/smp.c~revert-git-block arch/powerpc/platforms/ps3/smp.c --- a/arch/powerpc/platforms/ps3/smp.c~revert-git-block +++ a/arch/powerpc/platforms/ps3/smp.c @@ -105,10 +105,9 @@ static void __init ps3_smp_setup_cpu(int * to index needs to be setup. */ - BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION != 0); - BUILD_BUG_ON(PPC_MSG_RESCHEDULE != 1); - BUILD_BUG_ON(PPC_MSG_CALL_FUNC_SINGLE != 2); - BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK != 3); + BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION != 0); + BUILD_BUG_ON(PPC_MSG_RESCHEDULE != 1); + BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK != 3); for (i = 0; i < MSG_COUNT; i++) { result = ps3_event_receive_port_setup(cpu, &virqs[i]); diff -puN arch/powerpc/platforms/pseries/xics.c~revert-git-block arch/powerpc/platforms/pseries/xics.c --- a/arch/powerpc/platforms/pseries/xics.c~revert-git-block +++ a/arch/powerpc/platforms/pseries/xics.c @@ -383,11 +383,13 @@ static irqreturn_t xics_ipi_dispatch(int mb(); smp_message_recv(PPC_MSG_RESCHEDULE); } - if (test_and_clear_bit(PPC_MSG_CALL_FUNC_SINGLE, +#if 0 + if (test_and_clear_bit(PPC_MSG_MIGRATE_TASK, &xics_ipi_message[cpu].value)) { mb(); - smp_message_recv(PPC_MSG_CALL_FUNC_SINGLE); + smp_message_recv(PPC_MSG_MIGRATE_TASK); } +#endif #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) if (test_and_clear_bit(PPC_MSG_DEBUGGER_BREAK, &xics_ipi_message[cpu].value)) { diff -puN arch/powerpc/sysdev/mpic.c~revert-git-block arch/powerpc/sysdev/mpic.c --- a/arch/powerpc/sysdev/mpic.c~revert-git-block +++ a/arch/powerpc/sysdev/mpic.c @@ -1508,7 +1508,7 @@ void mpic_request_ipis(void) static char *ipi_names[] = { "IPI0 (call function)", "IPI1 (reschedule)", - "IPI2 (call function single)", + "IPI2 (unused)", "IPI3 (debugger break)", }; BUG_ON(mpic == NULL); diff -puN arch/sh/Kconfig~revert-git-block arch/sh/Kconfig --- a/arch/sh/Kconfig~revert-git-block +++ a/arch/sh/Kconfig @@ -689,7 +689,6 @@ config CRASH_DUMP config SMP bool "Symmetric multi-processing support" depends on SYS_SUPPORTS_SMP - select USE_GENERIC_SMP_HELPERS ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. 
If diff -puN arch/sh/kernel/smp.c~revert-git-block arch/sh/kernel/smp.c --- a/arch/sh/kernel/smp.c~revert-git-block +++ a/arch/sh/kernel/smp.c @@ -36,6 +36,13 @@ EXPORT_SYMBOL(cpu_possible_map); cpumask_t cpu_online_map; EXPORT_SYMBOL(cpu_online_map); +static atomic_t cpus_booted = ATOMIC_INIT(0); + +/* + * Run specified function on a particular processor. + */ +void __smp_call_function(unsigned int cpu); + static inline void __init smp_store_cpu_info(unsigned int cpu) { struct sh_cpuinfo *c = cpu_data + cpu; @@ -171,17 +178,42 @@ void smp_send_stop(void) smp_call_function(stop_this_cpu, 0, 1, 0); } -void arch_send_call_function_ipi(cpumask_t mask) +struct smp_fn_call_struct smp_fn_call = { + .lock = __SPIN_LOCK_UNLOCKED(smp_fn_call.lock), + .finished = ATOMIC_INIT(0), +}; + +/* + * The caller of this wants the passed function to run on every cpu. If wait + * is set, wait until all cpus have finished the function before returning. + * The lock is here to protect the call structure. + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int smp_call_function(void (*func)(void *info), void *info, int retry, int wait) { - int cpu; + unsigned int nr_cpus = atomic_read(&cpus_booted); + int i; - for_each_cpu_mask(cpu, mask) - plat_send_ipi(cpu, SMP_MSG_FUNCTION); -} + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); -void arch_send_call_function_single_ipi(int cpu) -{ - plat_send_ipi(cpu, SMP_MSG_FUNCTION_SINGLE); + spin_lock(&smp_fn_call.lock); + + atomic_set(&smp_fn_call.finished, 0); + smp_fn_call.fn = func; + smp_fn_call.data = info; + + for (i = 0; i < nr_cpus; i++) + if (i != smp_processor_id()) + plat_send_ipi(i, SMP_MSG_FUNCTION); + + if (wait) + while (atomic_read(&smp_fn_call.finished) != (nr_cpus - 1)); + + spin_unlock(&smp_fn_call.lock); + + return 0; } /* Not really SMP stuff ... 
*/ diff -puN arch/x86/Kconfig~revert-git-block arch/x86/Kconfig --- a/arch/x86/Kconfig~revert-git-block +++ a/arch/x86/Kconfig @@ -176,7 +176,6 @@ config GENERIC_PENDING_IRQ config X86_SMP bool depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) - select USE_GENERIC_SMP_HELPERS default y config X86_32_SMP diff -puN arch/x86/kernel/apic_32.c~revert-git-block arch/x86/kernel/apic_32.c --- a/arch/x86/kernel/apic_32.c~revert-git-block +++ a/arch/x86/kernel/apic_32.c @@ -1358,10 +1358,6 @@ void __init smp_intr_init(void) /* IPI for generic function call */ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for single call function */ - set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); } #endif diff -puN arch/x86/kernel/entry_64.S~revert-git-block arch/x86/kernel/entry_64.S --- a/arch/x86/kernel/entry_64.S~revert-git-block +++ a/arch/x86/kernel/entry_64.S @@ -813,9 +813,6 @@ END(invalidate_interrupt\num) ENTRY(call_function_interrupt) apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt END(call_function_interrupt) -ENTRY(call_function_single_interrupt) - apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt -END(call_function_single_interrupt) ENTRY(irq_move_cleanup_interrupt) apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt END(irq_move_cleanup_interrupt) diff -puN arch/x86/kernel/i8259_64.c~revert-git-block arch/x86/kernel/i8259_64.c --- a/arch/x86/kernel/i8259_64.c~revert-git-block +++ a/arch/x86/kernel/i8259_64.c @@ -494,10 +494,6 @@ void __init native_init_IRQ(void) /* IPI for generic function call */ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - /* IPI for generic single function call */ - set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); #endif diff -puN arch/x86/kernel/smp.c~revert-git-block arch/x86/kernel/smp.c --- a/arch/x86/kernel/smp.c~revert-git-block +++ a/arch/x86/kernel/smp.c @@ -121,32 +121,131 @@ static void native_smp_send_reschedule(i send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } +/* + * Structure and data for smp_call_function(). This is designed to minimise + * static memory requirements. It also looks cleaner. 
+ */ +static DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + atomic_t started; + atomic_t finished; + int wait; +}; + void lock_ipi_call_lock(void) { - spin_lock_irq(&call_function_lock); + spin_lock_irq(&call_lock); } void unlock_ipi_call_lock(void) { - spin_unlock_irq(&call_function_lock); + spin_unlock_irq(&call_lock); } -void native_send_call_func_single_ipi(int cpu) +static struct call_data_struct *call_data; + +static void __smp_call_function(void (*func) (void *info), void *info, + int nonatomic, int wait) { - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); + struct call_data_struct data; + int cpus = num_online_cpus() - 1; + + if (!cpus) + return; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); } -void native_send_call_func_ipi(cpumask_t mask) + +/** + * smp_call_function_mask(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on. Must not include the current cpu. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +static int +native_smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) { + struct call_data_struct data; cpumask_t allbutself; + int cpus; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + /* Holding any lock stops cpus from going down. 
*/ + spin_lock(&call_lock); allbutself = cpu_online_map; cpu_clear(smp_processor_id(), allbutself); + cpus_and(mask, mask, allbutself); + cpus = cpus_weight(mask); + + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + wmb(); + + /* Send a message to other CPUs */ if (cpus_equal(mask, allbutself)) send_IPI_allbutself(CALL_FUNCTION_VECTOR); else send_IPI_mask(mask, CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); + spin_unlock(&call_lock); + + return 0; } static void stop_this_cpu(void *dummy) @@ -168,13 +267,18 @@ static void stop_this_cpu(void *dummy) static void native_smp_send_stop(void) { + int nolock; unsigned long flags; if (reboot_force) return; - smp_call_function(stop_this_cpu, NULL, 0, 0); + /* Don't deadlock on the call lock in panic */ + nolock = !spin_trylock(&call_lock); local_irq_save(flags); + __smp_call_function(stop_this_cpu, NULL, 0, 0); + if (!nolock) + spin_unlock(&call_lock); disable_local_APIC(); local_irq_restore(flags); } @@ -196,28 +300,33 @@ void smp_reschedule_interrupt(struct pt_ void smp_call_function_interrupt(struct pt_regs *regs) { - ack_APIC_irq(); - irq_enter(); - generic_smp_call_function_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif - irq_exit(); -} + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + int wait = call_data->wait; -void smp_call_function_single_interrupt(void) -{ ack_APIC_irq(); + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&call_data->started); + /* + * At this point the info structure may be out of scope unless wait==1 + */ irq_enter(); - generic_smp_call_function_single_interrupt(); + (*func)(info); #ifdef CONFIG_X86_32 __get_cpu_var(irq_stat).irq_call_count++; #else add_pda(irq_call_count, 1); #endif irq_exit(); + + if (wait) { + mb(); + atomic_inc(&call_data->finished); + } } struct smp_ops smp_ops = { @@ -228,8 +337,7 @@ struct smp_ops smp_ops = { .smp_send_stop = native_smp_send_stop, .smp_send_reschedule = native_smp_send_reschedule, - - .send_call_func_ipi = native_send_call_func_ipi, - .send_call_func_single_ipi = native_send_call_func_single_ipi, + .smp_call_function_mask = native_smp_call_function_mask, }; EXPORT_SYMBOL_GPL(smp_ops); + diff -puN arch/x86/kernel/smpcommon.c~revert-git-block arch/x86/kernel/smpcommon.c --- a/arch/x86/kernel/smpcommon.c~revert-git-block +++ a/arch/x86/kernel/smpcommon.c @@ -25,3 +25,59 @@ __cpuinit void init_gdt(int cpu) per_cpu(cpu_number, cpu) = cpu; } #endif + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Unused. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. 
+ */ +int smp_call_function(void (*func) (void *info), void *info, int nonatomic, + int wait) +{ + return smp_call_function_mask(cpu_online_map, func, info, wait); +} +EXPORT_SYMBOL(smp_call_function); + +/** + * smp_call_function_single - Run a function on a specific CPU + * @cpu: The target CPU. Cannot be the calling CPU. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Unused. + * @wait: If true, wait until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + */ +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + /* prevent preemption and reschedule on another processor */ + int ret; + int me = get_cpu(); + if (cpu == me) { + local_irq_disable(); + func(info); + local_irq_enable(); + put_cpu(); + return 0; + } + + ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); + + put_cpu(); + return ret; +} +EXPORT_SYMBOL(smp_call_function_single); diff -puN arch/x86/mach-voyager/voyager_smp.c~revert-git-block arch/x86/mach-voyager/voyager_smp.c --- a/arch/x86/mach-voyager/voyager_smp.c~revert-git-block +++ a/arch/x86/mach-voyager/voyager_smp.c @@ -955,24 +955,94 @@ static void smp_stop_cpu_function(void * halt(); } +static DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + volatile unsigned long started; + volatile unsigned long finished; + int wait; +}; + +static struct call_data_struct *call_data; + /* execute a thread on a new CPU. The function to be called must be * previously set up. 
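
As a usage note, the revert also brings back the old five-argument prototypes in smpcommon.c just above. A hypothetical caller might look like the sketch below; read_remote_counter() and sample_cpu() are invented for the illustration, the nonatomic argument is passed as 0 because it is documented as unused, and wait=1 makes the call block until the callback has run on the target CPU.

/* sketch only -- assumes <linux/smp.h>, not part of the patch */
static void read_remote_counter(void *info)
{
        unsigned long *val = info;

        *val = 42;              /* pretend to sample something on that CPU */
}

static unsigned long sample_cpu(int cpu)
{
        unsigned long val = 0;

        smp_call_function_single(cpu, read_remote_counter, &val, 0, 1);
        return val;
}
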
This is used to schedule a function for * execution on all CPUs - set up the function then broadcast a * function_interrupt CPI to come here on each CPU */ static void smp_call_function_interrupt(void) { + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + /* must take copy of wait because call_data may be replaced + * unless the function is waiting for us to finish */ + int wait = call_data->wait; + __u8 cpu = smp_processor_id(); + + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + if (!test_and_clear_bit(cpu, &call_data->started)) { + /* If the bit wasn't set, this could be a replay */ + printk(KERN_WARNING "VOYAGER SMP: CPU %d received call funtion" + " with no call pending\n", cpu); + return; + } + /* + * At this point the info structure may be out of scope unless wait==1 + */ irq_enter(); - generic_smp_call_function_interrupt(); + (*func) (info); __get_cpu_var(irq_stat).irq_call_count++; irq_exit(); + if (wait) { + mb(); + clear_bit(cpu, &call_data->finished); + } } -static void smp_call_function_single_interrupt(void) +static int +voyager_smp_call_function_mask(cpumask_t cpumask, + void (*func) (void *info), void *info, int wait) { - irq_enter(); - generic_smp_call_function_single_interrupt(); - __get_cpu_var(irq_stat).irq_call_count++; - irq_exit(); + struct call_data_struct data; + u32 mask = cpus_addr(cpumask)[0]; + + mask &= ~(1 << smp_processor_id()); + + if (!mask) + return 0; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + data.started = mask; + data.wait = wait; + if (wait) + data.finished = mask; + + spin_lock(&call_lock); + call_data = &data; + wmb(); + /* Send a message to all other CPUs and wait for them to respond */ + send_CPI(mask, VIC_CALL_FUNCTION_CPI); + + /* Wait for response */ + while (data.started) + barrier(); + + if (wait) + while (data.finished) + barrier(); + + spin_unlock(&call_lock); + + return 0; } /* Sorry about the name. 
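
The Voyager path above uses the same rendezvous idea but counts acknowledgements down in a bitmask rather than up in atomic counters: started and finished begin as the target CPU mask and every responder clears its own bit with test_and_clear_bit()/clear_bit(). A stand-alone illustration of that count-down (the names and the three-CPU mask are made up for the example):

#include <stdatomic.h>
#include <stdio.h>

int main(void)
{
        atomic_ulong started;
        int cpu;

        /* bit n stands for CPU n; CPUs 1, 2 and 3 are the targets */
        atomic_init(&started, (1UL << 1) | (1UL << 2) | (1UL << 3));

        /* each "CPU" acknowledges by clearing its own bit */
        for (cpu = 1; cpu <= 3; cpu++)
                atomic_fetch_and(&started, ~(1UL << cpu));

        /* the initiator just spins until the mask reaches zero */
        if (atomic_load(&started) == 0)
                printf("all targets have picked up the call\n");
        return 0;
}
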
In an APIC based system, the APICs @@ -1029,12 +1099,6 @@ void smp_qic_call_function_interrupt(str smp_call_function_interrupt(); } -void smp_qic_call_function_single_interrupt(struct pt_regs *regs) -{ - ack_QIC_CPI(QIC_CALL_FUNCTION_SINGLE_CPI); - smp_call_function_single_interrupt(); -} - void smp_vic_cpi_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -1055,8 +1119,6 @@ void smp_vic_cpi_interrupt(struct pt_reg smp_enable_irq_interrupt(); if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) smp_call_function_interrupt(); - if (test_and_clear_bit(VIC_CALL_FUNCTION_SINGLE_CPI, &vic_cpi_mailbox[cpu])) - smp_call_function_single_interrupt(); set_irq_regs(old_regs); } @@ -1800,7 +1862,5 @@ struct smp_ops smp_ops = { .smp_send_stop = voyager_smp_send_stop, .smp_send_reschedule = voyager_smp_send_reschedule, - - .send_call_func_ipi = native_send_call_func_ipi, - .send_call_func_single_ipi = native_send_call_func_single_ipi, + .smp_call_function_mask = voyager_smp_call_function_mask, }; diff -puN arch/x86/xen/enlighten.c~revert-git-block arch/x86/xen/enlighten.c --- a/arch/x86/xen/enlighten.c~revert-git-block +++ a/arch/x86/xen/enlighten.c @@ -1123,9 +1123,7 @@ static const struct smp_ops xen_smp_ops .smp_send_stop = xen_smp_send_stop, .smp_send_reschedule = xen_smp_send_reschedule, - - .send_call_func_ipi = xen_smp_send_call_function_ipi, - .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, + .smp_call_function_mask = xen_smp_call_function_mask, }; #endif /* CONFIG_SMP */ diff -puN arch/x86/xen/mmu.c~revert-git-block arch/x86/xen/mmu.c --- a/arch/x86/xen/mmu.c~revert-git-block +++ a/arch/x86/xen/mmu.c @@ -569,7 +569,7 @@ static void drop_mm_ref(struct mm_struct } if (!cpus_empty(mask)) - smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); + xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); } #else static void drop_mm_ref(struct mm_struct *mm) diff -puN arch/x86/xen/smp.c~revert-git-block arch/x86/xen/smp.c --- a/arch/x86/xen/smp.c~revert-git-block +++ a/arch/x86/xen/smp.c @@ -36,14 +36,27 @@ #include "mmu.h" static cpumask_t xen_cpu_initialized_map; - -static DEFINE_PER_CPU(int, resched_irq); -static DEFINE_PER_CPU(int, callfunc_irq); -static DEFINE_PER_CPU(int, callfuncsingle_irq); +static DEFINE_PER_CPU(int, resched_irq) = -1; +static DEFINE_PER_CPU(int, callfunc_irq) = -1; static DEFINE_PER_CPU(int, debug_irq) = -1; +/* + * Structure and data for smp_call_function(). This is designed to minimise + * static memory requirements. It also looks cleaner. + */ +static DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + atomic_t started; + atomic_t finished; + int wait; +}; + static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); -static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); + +static struct call_data_struct *call_data; /* * Reschedule call back. 
Nothing to do, @@ -109,17 +122,6 @@ static int xen_smp_intr_init(unsigned in goto fail; per_cpu(debug_irq, cpu) = rc; - callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); - rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, - cpu, - xen_call_function_single_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, - callfunc_name, - NULL); - if (rc < 0) - goto fail; - per_cpu(callfuncsingle_irq, cpu) = rc; - return 0; fail: @@ -129,9 +131,6 @@ static int xen_smp_intr_init(unsigned in unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); if (per_cpu(debug_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); - if (per_cpu(callfuncsingle_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); - return rc; } @@ -339,6 +338,7 @@ void xen_smp_send_reschedule(int cpu) xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } + static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) { unsigned cpu; @@ -349,42 +349,83 @@ static void xen_send_IPI_mask(cpumask_t xen_send_IPI_one(cpu, vector); } -void xen_smp_send_call_function_ipi(cpumask_t mask) -{ - int cpu; - - xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); - - /* Make sure other vcpus get a chance to run if they need to. */ - for_each_cpu_mask(cpu, mask) { - if (xen_vcpu_stolen(cpu)) { - HYPERVISOR_sched_op(SCHEDOP_yield, 0); - break; - } - } -} - -void xen_smp_send_call_function_single_ipi(int cpu) -{ - xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); -} - static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) { + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + int wait = call_data->wait; + + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&call_data->started); + /* + * At this point the info structure may be out of scope unless wait==1 + */ irq_enter(); - generic_smp_call_function_interrupt(); + (*func)(info); __get_cpu_var(irq_stat).irq_call_count++; irq_exit(); + if (wait) { + mb(); /* commit everything before setting finished */ + atomic_inc(&call_data->finished); + } + return IRQ_HANDLED; } -static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) +int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), + void *info, int wait) { - irq_enter(); - generic_smp_call_function_single_interrupt(); - __get_cpu_var(irq_stat).irq_call_count++; - irq_exit(); + struct call_data_struct data; + int cpus, cpu; + bool yield; - return IRQ_HANDLED; + /* Holding any lock stops cpus from going down. */ + spin_lock(&call_lock); + + cpu_clear(smp_processor_id(), mask); + + cpus = cpus_weight(mask); + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); /* write everything before IPI */ + + /* Send a message to other CPUs and wait for them to respond */ + xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); + + /* Make sure other vcpus get a chance to run if they need to. 
*/ + yield = false; + for_each_cpu_mask(cpu, mask) + if (xen_vcpu_stolen(cpu)) + yield = true; + + if (yield) + HYPERVISOR_sched_op(SCHEDOP_yield, 0); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus || + (wait && atomic_read(&data.finished) != cpus)) + cpu_relax(); + + spin_unlock(&call_lock); + + return 0; } diff -puN arch/x86/xen/xen-ops.h~revert-git-block arch/x86/xen/xen-ops.h --- a/arch/x86/xen/xen-ops.h~revert-git-block +++ a/arch/x86/xen/xen-ops.h @@ -46,8 +46,13 @@ void xen_smp_cpus_done(unsigned int max_ void xen_smp_send_stop(void); void xen_smp_send_reschedule(int cpu); -void xen_smp_send_call_function_ipi(cpumask_t mask); -void xen_smp_send_call_function_single_ipi(int cpu); +int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait); +int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait); + +int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), + void *info, int wait); /* Declare an asm function, along with symbols needed to make it diff -puN block/Kconfig.iosched~revert-git-block block/Kconfig.iosched --- a/block/Kconfig.iosched~revert-git-block +++ a/block/Kconfig.iosched @@ -40,14 +40,6 @@ config IOSCHED_CFQ working environment, suitable for desktop systems. This is the default I/O scheduler. -config IOSCHED_BFQ - tristate "BFQ I/O scheduler" - default y - ---help--- - The BFQ I/O scheduler tries to distribute bandwidth among - all processes in the system, according to their weights, - giving deterministic guarantees on the service provided. - choice prompt "Default I/O scheduler" default DEFAULT_CFQ @@ -64,9 +56,6 @@ choice config DEFAULT_CFQ bool "CFQ" if IOSCHED_CFQ=y - config DEFAULT_BFQ - bool "BFQ" if IOSCHED_CFQ=y - config DEFAULT_NOOP bool "No-op" @@ -77,7 +66,6 @@ config DEFAULT_IOSCHED default "anticipatory" if DEFAULT_AS default "deadline" if DEFAULT_DEADLINE default "cfq" if DEFAULT_CFQ - default "bfq" if DEFAULT_BFQ default "noop" if DEFAULT_NOOP endmenu diff -puN block/Makefile~revert-git-block block/Makefile --- a/block/Makefile~revert-git-block +++ a/block/Makefile @@ -4,15 +4,13 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o blk-softirq.o ioctl.o genhd.o \ - scsi_ioctl.o + blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o -obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o diff -puN block/as-iosched.c~revert-git-block block/as-iosched.c --- a/block/as-iosched.c~revert-git-block +++ a/block/as-iosched.c @@ -450,7 +450,7 @@ static void as_antic_stop(struct as_data del_timer(&ad->antic_timer); ad->antic_status = ANTIC_FINISHED; /* see as_work_handler */ - kblockd_schedule_work(ad->q, &ad->antic_work); + kblockd_schedule_work(&ad->antic_work); } } @@ -471,7 +471,7 @@ static void as_antic_timeout(unsigned lo aic = ad->io_context->aic; ad->antic_status = ANTIC_FINISHED; - kblockd_schedule_work(q, &ad->antic_work); + kblockd_schedule_work(&ad->antic_work); if (aic->ttime_samples == 0) { /* process anticipated on has exited or timed out*/ @@ -831,7 +831,7 @@ static void as_completed_request(struct } if (ad->changed_batch && ad->nr_dispatched == 
1) { - kblockd_schedule_work(q, &ad->antic_work); + kblockd_schedule_work(&ad->antic_work); ad->changed_batch = 0; if (ad->batch_data_dir == REQ_SYNC) diff -puN block/bfq-iosched.c~revert-git-block /dev/null --- a/block/bfq-iosched.c +++ /dev/null @@ -1,2742 +0,0 @@ -/* - * BFQ, or Budget Fair Queueing, disk scheduler. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe <axboe@xxxxxxxxx> - * - * Copyright (C) 2008 Fabio Checconi <fabio@xxxxxxxxxxxxxxxx> - * Paolo Valente <paolo.valente@xxxxxxxxxx> - */ -#include <linux/module.h> -#include <linux/blkdev.h> -#include <linux/elevator.h> -#include <linux/rbtree.h> -#include <linux/ioprio.h> - -/* - * tunables - */ -/* max queue in one round of service */ -static const int bfq_quantum = 4; -static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; -/* maximum backwards seek, in KiB */ -static const int bfq_back_max = 16 * 1024; -/* penalty of a backwards seek */ -static const int bfq_back_penalty = 2; -static const int bfq_slice_async_rq = 2; -static int bfq_slice_idle = HZ / 125; -static const int bfq_max_budget = 4096; - -/* - * below this threshold, we consider thinktime immediate - */ -#define BFQ_MIN_TT (2) - -#define RQ_CIC(rq) \ - ((struct cfq_io_context *) (rq)->elevator_private) -#define RQ_BFQQ(rq) ((rq)->elevator_private2) - -static struct kmem_cache *bfq_pool; -static struct kmem_cache *bfq_ioc_pool; - -static DEFINE_PER_CPU(unsigned long, ioc_count); -static struct completion *ioc_gone; - -#define BFQ_PRIO_LISTS IOPRIO_BE_NR -#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) - -#define ASYNC (0) -#define SYNC (1) - -#define sample_valid(samples) ((samples) > 80) - -#define BFQ_IOPRIO_CLASSES 3 - -#define BFQ_BUDGET_STEP 128 - -typedef u64 bfq_timestamp_t; -typedef unsigned long bfq_weight_t; -typedef unsigned long bfq_service_t; - -struct bfq_wfqdata { - struct rb_root active; - struct rb_root idle; - - struct bfq_queue *first_idle; - struct bfq_queue *last_idle; - - bfq_timestamp_t vtime; - bfq_weight_t wsum; -}; - -#define BFQ_WFQDATA_INIT ((struct bfq_wfqdata) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - -/* - * Per block device queue structure - */ -struct bfq_data { - struct request_queue *queue; - - struct bfq_wfqdata service_tree[BFQ_IOPRIO_CLASSES]; - unsigned int busy_queues; - - int queued; - int rq_in_driver; - int sync_flight; - int hw_tag; - - /* - * idle window management - */ - struct timer_list idle_slice_timer; - struct work_struct unplug_work; - - struct bfq_queue *active_queue; - struct cfq_io_context *active_cic; - - /* - * async queue for each priority case - */ - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; - - sector_t last_position; - - /* - * tunables, see top of file - */ - unsigned int bfq_quantum; - unsigned int bfq_fifo_expire[2]; - unsigned int bfq_back_penalty; - unsigned int bfq_back_max; - unsigned int bfq_slice_async_rq; - unsigned int bfq_slice_idle; - unsigned int bfq_max_budget; - - struct list_head cic_list; -}; - -/* - * Per process-grouping structure - */ -struct bfq_queue { - /* reference count */ - atomic_t ref; - /* parent bfq_data */ - struct bfq_data *bfqd; - /* service_tree member */ - struct rb_node rb_node; - - /* sorted list of pending requests */ - struct rb_root sort_list; - /* if fifo isn't expired, next request to serve */ - struct request *next_rq; - /* requests queued in sort_list */ - int queued[2]; - /* currently allocated requests */ - int allocated[2]; - /* pending metadata requests */ - 
int meta_pending; - /* fifo list of requests in sort_list */ - struct list_head fifo; - - /* wfq timestamps */ - bfq_timestamp_t finish; - bfq_timestamp_t start; - - /* wfq tree the queue belongs to */ - struct rb_root *tree; - - /* minimum start time of the subtree rooted at this queue */ - bfq_timestamp_t min_start; - - /* service received and budget for the current run */ - bfq_service_t service, budget, act_budget; - /* effective weight of the queue */ - bfq_weight_t weight; - - /* number of requests that are on the dispatch list or inside driver */ - int dispatched; - - /* io prio of this group */ - unsigned short ioprio, org_ioprio, act_ioprio; - unsigned short ioprio_class, org_ioprio_class, act_ioprio_class; - - /* various state flags, see below */ - unsigned int flags; -}; - -static inline unsigned int bfq_bfqq_tree_index(struct bfq_queue *bfqq) -{ - unsigned int idx = bfqq->act_ioprio_class - 1; - - BUG_ON(idx >= BFQ_IOPRIO_CLASSES); - - return idx; -} - -static inline struct bfq_wfqdata *bfq_bfqq_wfqdata(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - return &bfqd->service_tree[bfq_bfqq_tree_index(bfqq)]; -} - -enum bfqq_state_flags { - BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ - BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ - BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ - BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ - BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ - BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -}; - -#define BFQ_BFQQ_FNS(name) \ -static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -{ \ - (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -} \ -static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -{ \ - (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -} \ -static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -{ \ - return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -} - -BFQ_BFQQ_FNS(busy); -BFQ_BFQQ_FNS(wait_request); -BFQ_BFQQ_FNS(must_alloc); -BFQ_BFQQ_FNS(fifo_expire); -BFQ_BFQQ_FNS(idle_window); -BFQ_BFQQ_FNS(prio_changed); -BFQ_BFQQ_FNS(sync); -#undef BFQ_BFQQ_FNS - -static void bfq_dispatch_insert(struct request_queue *, struct request *); -static struct bfq_queue *bfq_get_queue(struct bfq_data *, int, - struct io_context *, gfp_t); -static void bfq_put_queue(struct bfq_queue *bfqq); -static void bfq_forget_idle(struct bfq_wfqdata *wfqd); - -static struct cfq_io_context *bfq_cic_lookup(struct bfq_data *, - struct io_context *); - -static inline struct bfq_queue *cic_to_bfqq(struct cfq_io_context *cic, - int is_sync) -{ - return cic->cfqq[!!is_sync]; -} - -static inline void cic_set_bfqq(struct cfq_io_context *cic, - struct bfq_queue *bfqq, int is_sync) -{ - cic->cfqq[!!is_sync] = bfqq; -} - -/* - * We regard a request as SYNC, if it's either a read or has the SYNC bit - * set (in which case it could also be direct WRITE). - */ -static inline int bfq_bio_sync(struct bio *bio) -{ - if (bio_data_dir(bio) == READ || bio_sync(bio)) - return 1; - - return 0; -} - -/* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. 
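
The BFQ_BFQQ_FNS() macro above is the usual kernel idiom for stamping out mark/clear/test helpers, one set per flag bit. A compilable miniature of the same pattern, with generic names instead of the bfqq ones:

#include <stdio.h>

struct item {
        unsigned int flags;
};

enum item_state_flags {
        ITEM_FLAG_busy = 0,
        ITEM_FLAG_sync,
};

#define ITEM_FNS(name) \
static inline void mark_item_##name(struct item *it) \
{ \
        it->flags |= (1 << ITEM_FLAG_##name); \
} \
static inline void clear_item_##name(struct item *it) \
{ \
        it->flags &= ~(1 << ITEM_FLAG_##name); \
} \
static inline int item_##name(const struct item *it) \
{ \
        return (it->flags & (1 << ITEM_FLAG_##name)) != 0; \
}

ITEM_FNS(busy)
ITEM_FNS(sync)
#undef ITEM_FNS

int main(void)
{
        struct item it = { 0 };

        mark_item_busy(&it);
        printf("busy=%d sync=%d\n", item_busy(&it), item_sync(&it));
        clear_item_busy(&it);
        printf("busy=%d\n", item_busy(&it));
        return 0;
}
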
- */ -static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) -{ - if (bfqd->queued != 0) - kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); -} - -static int bfq_queue_empty(struct request_queue *q) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - return bfqd->queued == 0; -} - -/* - * Lifted from AS - choose which of rq1 and rq2 that is best served now. - * We choose the request that is closest to the head right now. Distance - * behind the head is penalized and only allowed to a certain extent. - */ -static struct request * -bfq_choose_req(struct bfq_data *bfqd, struct request *rq1, struct request *rq2) -{ - sector_t last, s1, s2, d1 = 0, d2 = 0; - unsigned long back_max; -#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ - unsigned wrap = 0; /* bit mask: requests behind the disk head? */ - - if (rq1 == NULL || rq1 == rq2) - return rq2; - if (rq2 == NULL) - return rq1; - - if (rq_is_sync(rq1) && !rq_is_sync(rq2)) - return rq1; - else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) - return rq2; - if (rq_is_meta(rq1) && !rq_is_meta(rq2)) - return rq1; - else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) - return rq2; - - s1 = rq1->sector; - s2 = rq2->sector; - - last = bfqd->last_position; - - /* - * by definition, 1KiB is 2 sectors - */ - back_max = bfqd->bfq_back_max * 2; - - /* - * Strict one way elevator _except_ in the case where we allow - * short backward seeks which are biased as twice the cost of a - * similar forward seek. - */ - if (s1 >= last) - d1 = s1 - last; - else if (s1 + back_max >= last) - d1 = (last - s1) * bfqd->bfq_back_penalty; - else - wrap |= BFQ_RQ1_WRAP; - - if (s2 >= last) - d2 = s2 - last; - else if (s2 + back_max >= last) - d2 = (last - s2) * bfqd->bfq_back_penalty; - else - wrap |= BFQ_RQ2_WRAP; - - /* Found required data */ - - /* - * By doing switch() on the bit mask "wrap" we avoid having to - * check two variables for all permutations: --> faster! - */ - switch (wrap) { - case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ - if (d1 < d2) - return rq1; - else if (d2 < d1) - return rq2; - else { - if (s1 >= s2) - return rq1; - else - return rq2; - } - - case BFQ_RQ2_WRAP: - return rq1; - case BFQ_RQ1_WRAP: - return rq2; - case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ - default: - /* - * Since both rqs are wrapped, - * start with the one that's further behind head - * (--> only *one* back seek required), - * since back seek takes more time than forward. - */ - if (s1 <= s2) - return rq1; - else - return rq2; - } -} - -/* - * would be nice to take fifo expire time into account as well - */ -static struct request * -bfq_find_next_rq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct request *last) -{ - struct rb_node *rbnext = rb_next(&last->rb_node); - struct rb_node *rbprev = rb_prev(&last->rb_node); - struct request *next = NULL, *prev = NULL; - - BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - - if (rbprev != NULL) - prev = rb_entry_rq(rbprev); - - if (rbnext != NULL) - next = rb_entry_rq(rbnext); - else { - rbnext = rb_first(&bfqq->sort_list); - if (rbnext && rbnext != &last->rb_node) - next = rb_entry_rq(rbnext); - } - - return bfq_choose_req(bfqd, next, prev); -} - -/* - * Shift for timestamp calculations. This actually limits the maximum - * service allowed in one go (small shift values increase it), and the - * maximum total weight of a queue (big shift values increase it), and - * the period of virtual time wraparounds. 
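
The request-choosing rule in bfq_choose_req() above (lifted from AS/CFQ) charges forward seeks their distance, short backward seeks the distance times bfq_back_penalty, and treats anything further than back_max behind the head as "wrapped". Distilled into a stand-alone function, illustrative only; the 32768-sector back_max corresponds to the removed file's default bfq_back_max of 16*1024 KiB and the penalty of 2 is its default as well:

#include <stdio.h>

/* cost of moving the head from "last" to "s", in the spirit of
 * bfq_choose_req(): forward seeks cost their distance, backward seeks
 * within back_max cost penalty times the distance, anything further
 * behind is "wrapped" (treated here as effectively infinite). */
static unsigned long long seek_cost(unsigned long long last,
                                    unsigned long long s,
                                    unsigned long long back_max,
                                    unsigned int penalty)
{
        if (s >= last)
                return s - last;
        if (s + back_max >= last)
                return (last - s) * penalty;
        return ~0ULL;                   /* "wrapped" */
}

int main(void)
{
        unsigned long long last = 100000;

        printf("forward  +500: %llu\n", seek_cost(last, last + 500, 32768, 2));
        printf("backward -500: %llu\n", seek_cost(last, last - 500, 32768, 2));
        printf("backward -60k: %llu\n", seek_cost(last, 40000, 32768, 2));
        return 0;
}
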
- */ -#define WFQ_SERVICE_SHIFT 22 - -/** - * bfq_gt - compare two timestamps. - * @a: first ts. - * @b: second ts. - * - * Return @a > @b, dealing with wrapping correctly. - */ -static inline int bfq_gt(bfq_timestamp_t a, bfq_timestamp_t b) -{ - return (s64)(a - b) > 0; -} - -/** - * bfq_delta - map service into the virtual time domain. - * @service: amount of service. - * @weight: scale factor. - */ -static inline bfq_timestamp_t bfq_delta(bfq_service_t service, - bfq_weight_t weight) -{ - bfq_timestamp_t d = (bfq_timestamp_t)service << WFQ_SERVICE_SHIFT; - - do_div(d, weight); - return d; -} - -/** - * bfq_calc_finish - assign the finish time to a queue. - * @bfqq: the queue to act upon. - */ -static inline void bfq_calc_finish(struct bfq_queue *bfqq) -{ - BUG_ON(bfqq->budget == 0); - - bfqq->finish = bfqq->start + bfq_delta(bfqq->act_budget, bfqq->weight); -} - -/** - * bfq_bfqq_of - get a bfqq from a node. - * @node: the node field of the bfqq. - * - * Convert a node pointer to the relative queue. This is used only - * to simplify the logic of some functions and not as the generic - * conversion mechanism because, e.g., in the tree walking functions, - * the check for a %NULL value would be redundant. - */ -static inline struct bfq_queue *bfq_bfqq_of(struct rb_node *node) -{ - struct bfq_queue *bfqq = NULL; - - if (node != NULL) - bfqq = rb_entry(node, struct bfq_queue, rb_node); - - return bfqq; -} - -/** - * bfq_extract - remove a queue from a tree. - * @root: the tree root. - * @bfqq: the queue to remove. - */ -static inline void bfq_extract(struct rb_root *root, - struct bfq_queue *bfqq) -{ - BUG_ON(bfqq->tree != root); - - bfqq->tree = NULL; - rb_erase(&bfqq->rb_node, root); -} - -/** - * bfq_idle_extract - extract a queue from the idle tree. - * @wfqd: the wfqdata of the device owning @bfqq. - * @bfqq: the queue being removed. - */ -static void bfq_idle_extract(struct bfq_wfqdata *wfqd, - struct bfq_queue *bfqq) -{ - struct rb_node *next; - - BUG_ON(bfqq->tree != &wfqd->idle); - - if (bfqq == wfqd->first_idle) { - next = rb_next(&bfqq->rb_node); - wfqd->first_idle = bfq_bfqq_of(next); - } - - if (bfqq == wfqd->last_idle) { - next = rb_prev(&bfqq->rb_node); - wfqd->last_idle = bfq_bfqq_of(next); - } - - bfq_extract(&wfqd->idle, bfqq); -} - -/** - * bfq_update_finish - resync the finish time with the service received - * @bfqq: the queue to update. - * - * The queue may have received less service than allocated, decrease its - * finish time. This is called only for the queue under service. - */ -static inline void bfq_update_finish(struct bfq_queue *bfqq) -{ - BUG_ON(bfqq->finish < bfqq->start + - bfq_delta(bfqq->service, bfqq->weight)); - - bfqq->finish = bfqq->start + bfq_delta(bfqq->service, bfqq->weight); -} - -/** - * bfq_insert - generic tree insertion. - * @root: tree root. - * @bfqq: queue to insert. - * - * This is used for the idle and the active tree, since they are both - * ordered by finish time. - */ -static void bfq_insert(struct rb_root *root, struct bfq_queue *bfqq) -{ - struct bfq_queue *entry; - struct rb_node **node = &root->rb_node; - struct rb_node *parent = NULL; - - while (*node != NULL) { - parent = *node; - entry = rb_entry(parent, struct bfq_queue, rb_node); - - if (bfq_gt(entry->finish, bfqq->finish)) - node = &parent->rb_left; - else - node = &parent->rb_right; - } - - rb_link_node(&bfqq->rb_node, parent, node); - rb_insert_color(&bfqq->rb_node, root); - - bfqq->tree = root; -} - -/** - * bfq_update_min - update the min_start field of a queue. 
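
bfq_gt() and bfq_delta() above carry the whole timestamp arithmetic: ordering is decided modulo 2^64 through a signed subtraction, and service is mapped into virtual time by scaling with 2^WFQ_SERVICE_SHIFT and dividing by the weight. A small stand-alone check of both, with plain integer types in place of bfq_timestamp_t and bfq_service_t:

#include <stdio.h>
#include <stdint.h>

#define WFQ_SERVICE_SHIFT 22

static int ts_gt(uint64_t a, uint64_t b)        /* bfq_gt(): wrap-safe "a > b" */
{
        return (int64_t)(a - b) > 0;
}

static uint64_t ts_delta(uint64_t service, unsigned long weight)
{
        return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
        uint64_t near_wrap = UINT64_MAX - 5;

        /* 2 is "after" UINT64_MAX - 5 once the clock has wrapped */
        printf("wrap-safe: %d, naive: %d\n",
               ts_gt(2, near_wrap), 2 > near_wrap);

        /* the same amount of service advances a heavy queue's clock less */
        printf("delta(w=1)=%llu delta(w=8)=%llu\n",
               (unsigned long long)ts_delta(1024, 1),
               (unsigned long long)ts_delta(1024, 8));
        return 0;
}
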
- * @bfqq: the queue to update. - * @node: one of its children. - * - * This function is called when @bfqq may store an invalid value for - * min_start due to updates to the active tree. It assumes that the subtree - * rooted at @node (that may be its left or its right child) has a valid - * min_start value. - */ -static inline void bfq_update_min(struct bfq_queue *bfqq, - struct rb_node *node) -{ - struct bfq_queue *child; - - if (node != NULL) { - child = rb_entry(node, struct bfq_queue, rb_node); - if (bfq_gt(bfqq->min_start, child->min_start)) - bfqq->min_start = child->min_start; - } -} - -/** - * bfq_update_active_node - recalculate min_start. - * @node: the node to update. - * - * @node may have changed position or one of its children can have moved, - * this function updates its min_start value. The left and right subtrees - * are assumed to hold a correct min_start value. - */ -static inline void bfq_update_active_node(struct rb_node *node) -{ - struct bfq_queue *bfqq = rb_entry(node, struct bfq_queue, rb_node); - - bfqq->min_start = bfqq->start; - bfq_update_min(bfqq, node->rb_right); - bfq_update_min(bfqq, node->rb_left); -} - -/** - * bfq_update_active_tree - update min_start for the whole active tree. - * @node: the starting node. - * - * @node must be the deepest modified node after an update. This function - * updates its min_start using the values held by its children, assuming - * that they did not change, and then updates all the nodes that may have - * changed in the path to the root. The only nodes that may have changed - * are those in the path or their siblings. - */ -static void bfq_update_active_tree(struct rb_node *node) -{ - struct rb_node *parent; - -up: - bfq_update_active_node(node); - - parent = rb_parent(node); - if (parent == NULL) - return; - - if (node == parent->rb_left && parent->rb_right != NULL) - bfq_update_active_node(parent->rb_right); - else if (parent->rb_left != NULL) - bfq_update_active_node(parent->rb_left); - - node = parent; - goto up; -} - -/** - * bfq_active_insert - insert a queue in the active tree of its device. - * @wfqd: the wfqdata of the device data containing the tree. - * @bfqq: the queue being inserted. - * - * The active tree is ordered by finish time, but an extra key is kept - * per each node, containing the minimum value for the start times of - * its children (and the node itself), so it's possible to search for - * the eligible node with the lowest finish time in logarithmic time. - */ -static void bfq_active_insert(struct bfq_wfqdata *wfqd, - struct bfq_queue *bfqq) -{ - struct rb_node *node = &bfqq->rb_node; - - bfq_insert(&wfqd->active, bfqq); - - if (node->rb_left != NULL) - node = node->rb_left; - else if (node->rb_right != NULL) - node = node->rb_right; - - bfq_update_active_tree(node); -} - -/** - * bfq_ioprio_to_weight - calc the weight for a queue. - * @bfqq: the queue to act upon. - */ -static bfq_weight_t bfq_ioprio_to_weight(struct bfq_queue *bfqq) -{ - WARN_ON(bfqq->act_ioprio >= IOPRIO_BE_NR); - return IOPRIO_BE_NR - bfqq->act_ioprio; -} - -/** - * bfq_update_weight - update the weight of a queue. - * @wfqd: wfqdata for the device. - * @bfqq: queue to act upon. - * @old_weight: weight @bfqq had on @wfqdata. 
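
bfq_ioprio_to_weight() above turns a best-effort ioprio into a weight of IOPRIO_BE_NR - ioprio, so priorities 0..7 become weights 8..1, and through bfq_delta() a heavier queue's finish times advance proportionally more slowly for the same service, which is what yields the larger bandwidth share. A quick numeric illustration; IOPRIO_BE_NR is taken as 8 and 4096 sectors is used as a sample budget, both matching the removed file:

#include <stdio.h>
#include <stdint.h>

#define IOPRIO_BE_NR            8
#define WFQ_SERVICE_SHIFT       22

int main(void)
{
        unsigned int ioprio;

        for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++) {
                unsigned long weight = IOPRIO_BE_NR - ioprio;
                uint64_t step = ((uint64_t)4096 << WFQ_SERVICE_SHIFT) / weight;

                printf("ioprio %u -> weight %lu, finish advances by %llu per 4096 sectors\n",
                       ioprio, weight, (unsigned long long)step);
        }
        return 0;
}
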
- */ -static void bfq_update_weight(struct bfq_wfqdata **wfqd, - struct bfq_queue *bfqq, - bfq_weight_t old_weight) -{ - struct bfq_data *bfqd = bfqq->bfqd; - struct bfq_wfqdata *new_wfqd = *wfqd; - - if (bfqq->act_ioprio != bfqq->ioprio || - bfqq->act_ioprio_class != bfqq->ioprio_class) { - bfqq->act_ioprio = bfqq->ioprio; - bfqq->act_ioprio_class = bfqq->ioprio_class; - bfqq->weight = bfq_ioprio_to_weight(bfqq); - new_wfqd = &bfqd->service_tree[bfq_bfqq_tree_index(bfqq)]; - if (new_wfqd != *wfqd) - bfqq->start = new_wfqd->vtime; - } else if (old_weight != 0) - /* Already enqueued with the same weight. */ - return; - - (*wfqd)->wsum -= old_weight; - new_wfqd->wsum += bfqq->weight; - *wfqd = new_wfqd; -} - -/** - * bfq_activate_bfqq - activate a queue. - * @bfqd: the device data. - * @bfqq: the queue being activated. - * - * Called whenever a queue is activated, i.e., it is not active and - * receives a new request, or has to be reactivated due to budget - * exhaustion. It uses the current budget of the queue (and the service - * received if @bfqq is active) of the queue to calculate its timestamps. - */ -static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct bfq_wfqdata *wfqd = bfq_bfqq_wfqdata(bfqd, bfqq); - bfq_weight_t old_weight; - - if (bfqq == bfqd->active_queue) { - BUG_ON(bfqq->tree != NULL); - /* - * If we are requeueing the current queue we have - * to take care of not charging to it service it has - * not received. - */ - bfq_update_finish(bfqq); - bfqq->start = bfqq->finish; - old_weight = bfqq->weight; - } else if (bfqq->tree != NULL) { - /* - * Must be on the idle tree, bfq_idle_extract() will - * check for that. - */ - bfq_idle_extract(wfqd, bfqq); - bfqq->start = bfq_gt(wfqd->vtime, bfqq->finish) ? - wfqd->vtime : bfqq->finish; - old_weight = bfqq->weight; - } else { - /* - * The finish time of the queue can be invalid, and - * it is in the past for sure, otherwise the queue - * would have been on the idle tree. - */ - bfqq->start = wfqd->vtime; - atomic_inc(&bfqq->ref); - old_weight = 0; - } - - bfq_update_weight(&wfqd, bfqq, old_weight); - bfq_calc_finish(bfqq); - bfq_active_insert(wfqd, bfqq); -} - -/** - * bfq_find_deepest - find the deepest node that an extraction can modify. - * @node: the node being removed. - * - * Do the first step of an extraction in an rb tree, looking for the - * node that will replace @node, and returning the deepest node that - * the following modifications to the tree can touch. If @node is the - * last node in the tree return %NULL. - */ -static struct rb_node *bfq_find_deepest(struct rb_node *node) -{ - struct rb_node *deepest; - - if (node->rb_right == NULL && node->rb_left == NULL) - deepest = rb_parent(node); - else if (node->rb_right == NULL) - deepest = node->rb_left; - else if (node->rb_left == NULL) - deepest = node->rb_right; - else { - deepest = rb_next(node); - if (deepest->rb_right != NULL) - deepest = deepest->rb_right; - else if (rb_parent(deepest) != node) - deepest = rb_parent(deepest); - } - - return deepest; -} - -/** - * bfq_active_extract - remove an entity from the active tree. - * @wfqd: the wfqdata containing the tree. - * @bfqq: the queue being removed. - */ -static void bfq_active_extract(struct bfq_wfqdata *wfqd, - struct bfq_queue *bfqq) -{ - struct rb_node *node; - - node = bfq_find_deepest(&bfqq->rb_node); - bfq_extract(&wfqd->active, bfqq); - - if (node != NULL) - bfq_update_active_tree(node); -} - -/** - * bfq_idle_insert - insert an entity into the idle tree. 
- * @wfqd: the queue containing the tree. - * @bfqq: the queue to insert. - */ -static void bfq_idle_insert(struct bfq_wfqdata *wfqd, - struct bfq_queue *bfqq) -{ - struct bfq_queue *first_idle = wfqd->first_idle; - struct bfq_queue *last_idle = wfqd->last_idle; - - if (first_idle == NULL || bfq_gt(first_idle->finish, bfqq->finish)) - wfqd->first_idle = bfqq; - if (last_idle == NULL || bfq_gt(bfqq->finish, last_idle->finish)) - wfqd->last_idle = bfqq; - - bfq_insert(&wfqd->idle, bfqq); -} - -/** - * bfq_forget_queue - remove a queue from the wfq trees. - * @wfqd: the wfqdata. - * @bfqq: the queue being removed. - * - * Update the device status and forget everything about @bfqq, putting - * the device reference to it. - */ -static void bfq_forget_queue(struct bfq_wfqdata *wfqd, - struct bfq_queue *bfqq) -{ - wfqd->wsum -= bfqq->weight; - bfq_put_queue(bfqq); -} - -static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct bfq_wfqdata *wfqd = bfq_bfqq_wfqdata(bfqd, bfqq); - - if (bfqq == bfqd->active_queue) { - BUG_ON(bfqq->tree != NULL); - bfq_update_finish(bfqq); - bfqd->active_queue = NULL; - } else - bfq_active_extract(wfqd, bfqq); - - if (bfq_gt(bfqq->finish, wfqd->vtime)) - bfq_idle_insert(wfqd, bfqq); - else - bfq_forget_queue(wfqd, bfqq); -} - -/** - * bfq_put_idle_queue - release the idle tree ref of a queue. - * @wfqd: wfqdata of the device. - * @bfqq: the queue being released. - */ -static void bfq_put_idle_queue(struct bfq_wfqdata *wfqd, - struct bfq_queue *bfqq) -{ - bfq_idle_extract(wfqd, bfqq); - bfq_forget_queue(wfqd, bfqq); -} - -/** - * bfq_bfqq_served - update the scheduler status after service. - * @bfqd: the device data. - * @bfqq: the queue being served. - * @served: bytes transfered/to transfer. - */ -static void bfq_bfqq_served(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_service_t served) -{ - struct bfq_wfqdata *wfqd = bfq_bfqq_wfqdata(bfqd, bfqq); - - WARN_ON_ONCE(bfqq->service > bfqq->act_budget); - - bfqq->service += served; - WARN_ON_ONCE(bfqq->service > bfqq->act_budget); - wfqd->vtime += bfq_delta(served, wfqd->wsum); - - bfq_forget_idle(wfqd); -} - -/* - * Called when an inactive queue receives a new request. - */ -static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - BUG_ON(bfq_bfqq_busy(bfqq)); - BUG_ON(bfqq == bfqd->active_queue); - bfq_mark_bfqq_busy(bfqq); - bfqd->busy_queues++; - - bfq_activate_bfqq(bfqd, bfqq); -} - -/* - * Called when the bfqq no longer has requests pending, remove it from - * the service tree. - */ -static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - BUG_ON(!bfq_bfqq_busy(bfqq)); - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - - bfq_clear_bfqq_busy(bfqq); - bfq_deactivate_bfqq(bfqd, bfqq); - - BUG_ON(bfqd->busy_queues == 0); - bfqd->busy_queues--; -} - -/* - * rb tree support functions - */ -static void bfq_del_rq_rb(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - const int sync = rq_is_sync(rq); - - BUG_ON(bfqq->queued[sync] == 0); - bfqq->queued[sync]--; - bfqd->queued--; - - elv_rb_del(&bfqq->sort_list, rq); - - if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue && - RB_EMPTY_ROOT(&bfqq->sort_list)) - bfq_del_bfqq_busy(bfqd, bfqq); -} - -/** - * bfq_updated_next_req - update the queue after a new next_rq selection. - * @bfqd: the device data the queue belongs to. - * @bfqq: the queue to update. 
- * - * Whenever the first request of a queue changes we try to allocate it - * enough service (if it has grown), or to anticipate its finish time - * (if it has shrinked), to reduce the time it has to wait, still taking - * into account the queue budget. We try to avoid the queue having not - * enough service allocated for its first request, thus having to go - * through two dispatch rounds to actually dispatch the request. - */ -static void bfq_updated_next_req(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - struct bfq_wfqdata *wfqd = bfq_bfqq_wfqdata(bfqd, bfqq); - struct request *next_rq = bfqq->next_rq; - bfq_service_t new_budget; - - if (next_rq == NULL) - return; - - if (bfqq == bfqd->active_queue) - /* - * In order not to break guarantees, budgets cannot be - * changed after an activity has been selected. - */ - return; - - BUG_ON(bfqq->tree != &wfqd->active); - - new_budget = max(bfqq->budget, next_rq->hard_nr_sectors); - if (new_budget <= bfqq->act_budget) - /* - * Finish times cannot be decreased while the queue - * is either schedulable or not eligible, as it would - * invalidate previous scheduling decisions. The - * current budget is enough to satisfy the first req - * anyway. - */ - return; - - bfqq->act_budget = new_budget; - bfq_active_extract(wfqd, bfqq); - bfq_calc_finish(bfqq); - bfq_active_insert(wfqd, bfqq); -} - -static void bfq_add_rq_rb(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - struct request *__alias, *next_rq; - - bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; - - /* - * looks a little odd, but the first insert might return an alias. - * if that happens, put the alias on the dispatch list - */ - while ((__alias = elv_rb_add(&bfqq->sort_list, rq)) != NULL) - bfq_dispatch_insert(bfqd->queue, __alias); - - /* - * check if this request is a better next-serve candidate - */ - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq); - BUG_ON(next_rq == NULL); - bfqq->next_rq = next_rq; - - if (!bfq_bfqq_busy(bfqq)) { - bfqq->act_budget = max(bfqq->budget, next_rq->hard_nr_sectors); - bfq_add_bfqq_busy(bfqd, bfqq); - } else - bfq_updated_next_req(bfqd, bfqq); -} - -static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq) -{ - elv_rb_del(&bfqq->sort_list, rq); - bfqq->queued[rq_is_sync(rq)]--; - bfqq->bfqd->queued--; - bfq_add_rq_rb(rq); -} - -static struct request * -bfq_find_rq_fmerge(struct bfq_data *bfqd, struct bio *bio) -{ - struct task_struct *tsk = current; - struct cfq_io_context *cic; - struct bfq_queue *bfqq; - - cic = bfq_cic_lookup(bfqd, tsk->io_context); - if (cic == NULL) - return NULL; - - bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio)); - if (bfqq != NULL) { - sector_t sector = bio->bi_sector + bio_sectors(bio); - - return elv_rb_find(&bfqq->sort_list, sector); - } - - return NULL; -} - -static void bfq_activate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - bfqd->rq_in_driver++; - - /* - * If the depth is larger 1, it really could be queueing. But lets - * make the mark a little higher - idling could still be good for - * low queueing, and a low queueing number could also just indicate - * a SCSI mid layer like behaviour where limit+1 is often seen. 
- */ - if (!bfqd->hw_tag && bfqd->rq_in_driver > 4) - bfqd->hw_tag = 1; - - bfqd->last_position = rq->hard_sector + rq->hard_nr_sectors; -} - -static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - WARN_ON(bfqd->rq_in_driver == 0); - bfqd->rq_in_driver--; -} - -static void bfq_remove_request(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - - if (bfqq->next_rq == rq) { - bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); - bfq_updated_next_req(bfqd, bfqq); - } - - list_del_init(&rq->queuelist); - bfq_del_rq_rb(rq); - - if (rq_is_meta(rq)) { - WARN_ON(bfqq->meta_pending == 0); - bfqq->meta_pending--; - } -} - -static int bfq_merge(struct request_queue *q, struct request **req, - struct bio *bio) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *__rq; - - __rq = bfq_find_rq_fmerge(bfqd, bio); - if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; - } - - return ELEVATOR_NO_MERGE; -} - -static void bfq_merged_request(struct request_queue *q, struct request *req, - int type) -{ - if (type == ELEVATOR_FRONT_MERGE) { - struct bfq_queue *bfqq = RQ_BFQQ(req); - - bfq_reposition_rq_rb(bfqq, req); - } -} - -static void -bfq_merged_requests(struct request_queue *q, struct request *rq, - struct request *next) -{ - /* - * reposition in fifo if next is older than rq - */ - if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && - time_before(next->start_time, rq->start_time)) - list_move(&rq->queuelist, &next->queuelist); - - bfq_remove_request(next); -} - -static int bfq_allow_merge(struct request_queue *q, struct request *rq, - struct bio *bio) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct cfq_io_context *cic; - struct bfq_queue *bfqq; - - /* - * Disallow merge of a sync bio into an async request. - */ - if (bfq_bio_sync(bio) && !rq_is_sync(rq)) - return 0; - - /* - * Lookup the bfqq that this bio will be queued with. Allow - * merge only if rq is queued there. - */ - cic = bfq_cic_lookup(bfqd, current->io_context); - if (cic == NULL) - return 0; - - bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio)); - if (bfqq == RQ_BFQQ(rq)) - return 1; - - return 0; -} - -static void __bfq_set_active_queue(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - if (bfqq != NULL) { - bfq_mark_bfqq_must_alloc(bfqq); - bfq_clear_bfqq_fifo_expire(bfqq); - bfqq->service = 0; - } - - bfqd->active_queue = bfqq; -} - -/** - * bfq_forget_idle - update the idle tree if necessary. - * @wfqd: the wfqdata to act upon. - * - * To preserve the global O(log N) complexity we only remove one entry here; - * as the idle tree will not grow indefinitely this can be done safely. - */ -static void bfq_forget_idle(struct bfq_wfqdata *wfqd) -{ - struct bfq_queue *first_idle = wfqd->first_idle; - struct bfq_queue *last_idle = wfqd->last_idle; - - if (RB_EMPTY_ROOT(&wfqd->active) && last_idle != NULL) { - /* - * Forget the whole idle tree, increasing the vtime past - * the last finish time of idle entities. - */ - wfqd->vtime = last_idle->finish; - } - - if (first_idle != NULL && !bfq_gt(first_idle->finish, wfqd->vtime)) - bfq_put_idle_queue(wfqd, first_idle); -} - -/** - * bfq_update_vtime - update vtime if necessary. - * @queue: the wfqdata to act upon. - * - * If necessary update the device vtime to have at least one eligible - * entity, skipping to its start time. 
Assumes that the active tree - * of the device is not empty. - */ -static void bfq_update_vtime(struct bfq_wfqdata *wfqd) -{ - struct bfq_queue *entry; - struct rb_node *node = wfqd->active.rb_node; - - entry = rb_entry(node, struct bfq_queue, rb_node); - if (bfq_gt(entry->min_start, wfqd->vtime)) { - wfqd->vtime = entry->min_start; - bfq_forget_idle(wfqd); - } -} - -/** - * bfq_first_active - find the eligible entity with the smallest finish time - * @wfqd: the wfqdata to select from. - * - * This function searches the first schedulable queue, starting from the - * root of the tree and going on the left every time on this side there is - * a subtree with at least one eligible (start >= vtime) entity. The path - * on the right is followed only if a) the left subtree contains no eligible - * queue and b) no eligible queue has been found yet. - */ -static struct bfq_queue *bfq_first_active(struct bfq_wfqdata *wfqd) -{ - struct bfq_queue *entry, *first = NULL; - struct rb_node *node = wfqd->active.rb_node; - - while (node != NULL) { - entry = rb_entry(node, struct bfq_queue, rb_node); -left: - if (!bfq_gt(entry->start, wfqd->vtime)) - first = entry; - - BUG_ON(bfq_gt(entry->min_start, wfqd->vtime)); - - if (node->rb_left != NULL) { - entry = rb_entry(node->rb_left, - struct bfq_queue, rb_node); - if (!bfq_gt(entry->min_start, wfqd->vtime)) { - node = node->rb_left; - goto left; - } - } - if (first != NULL) - break; - node = node->rb_right; - } - - return first; -} - -static struct bfq_queue *bfq_wfqnext(struct bfq_wfqdata *wfqd) -{ - struct bfq_queue *bfqq; - - if (RB_EMPTY_ROOT(&wfqd->active)) - return NULL; - - bfq_update_vtime(wfqd); - bfqq = bfq_first_active(wfqd); - bfq_active_extract(wfqd, bfqq); - - BUG_ON(bfq_gt(bfqq->start, wfqd->vtime)); - - return bfqq; -} - -/* - * Get next queue for service. - */ -static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq; - int i; - - BUG_ON(bfqd->active_queue != NULL); - - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { - bfqq = bfq_wfqnext(&bfqd->service_tree[i]); - if (bfqq != NULL) - break; - } - - return bfqq; -} - -/* - * Get and set a new active queue for service. - */ -static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq; - - bfqq = bfq_get_next_queue(bfqd); - __bfq_set_active_queue(bfqd, bfqq); - return bfqq; -} - -static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd, - struct request *rq) -{ - if (rq->sector >= bfqd->last_position) - return rq->sector - bfqd->last_position; - else - return bfqd->last_position - rq->sector; -} - -static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq) -{ - struct cfq_io_context *cic = bfqd->active_cic; - - if (!sample_valid(cic->seek_samples)) - return 0; - - return bfq_dist_from_last(bfqd, rq) <= cic->seek_mean; -} - -static int bfq_close_cooperator(struct bfq_data *bfq_data, - struct bfq_queue *bfqq) -{ - /* - * We should notice if some of the queues are cooperating, eg - * working closely on the same area of the disk. In that case, - * we can group them together and don't waste time idling. 
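
bfq_first_active() above performs the core WF2Q+-style pick: among the queues whose start time is not after the device vtime (the eligible ones), take the smallest finish time; the min_start annotation on the active tree only exists so that this search stays logarithmic. The rule itself, stripped of the rb-tree and run over made-up numbers:

#include <stdio.h>
#include <stdint.h>

struct queue {
        const char *name;
        uint64_t start, finish;
};

int main(void)
{
        struct queue q[] = {
                { "A", 10, 50 },
                { "B", 20, 40 },
                { "C", 90, 95 },        /* not yet eligible at vtime 30 */
        };
        uint64_t vtime = 30;
        struct queue *best = NULL;
        unsigned int i;

        for (i = 0; i < sizeof(q) / sizeof(q[0]); i++) {
                if (q[i].start > vtime)         /* not eligible yet */
                        continue;
                if (best == NULL || q[i].finish < best->finish)
                        best = &q[i];
        }
        printf("serve %s next\n", best ? best->name : "nothing");
        return 0;
}
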
- */ - return 0; -} - -#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024)) - -static void bfq_arm_slice_timer(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq = bfqd->active_queue; - struct cfq_io_context *cic; - unsigned long sl; - - WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - - /* - * idle is disabled, either manually or by past process history - */ - if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq)) - return; - - /* - * task has exited, don't wait - */ - cic = bfqd->active_cic; - if (cic == NULL || atomic_read(&cic->ioc->nr_tasks) == 0) - return; - - /* - * See if this prio level has a good candidate - */ - if (bfq_close_cooperator(bfqd, bfqq) && - (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2)) - return; - - bfq_mark_bfqq_wait_request(bfqq); - - /* - * we don't want to idle for seeks, but we do want to allow - * fair distribution of slice time for a process doing back-to-back - * seeks. so allow a little bit of time for him to submit a new rq - */ - sl = bfqd->bfq_slice_idle; - if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) - sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); - - mod_timer(&bfqd->idle_slice_timer, jiffies + sl); -} - -/* - * Move request from internal lists to the request queue dispatch list. - */ -static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - bfq_remove_request(rq); - bfqq->dispatched++; - elv_dispatch_sort(q, rq); - - if (bfq_bfqq_sync(bfqq)) - bfqd->sync_flight++; -} - -/* - * return expired entry, or NULL to just start from scratch in rbtree - */ -static struct request *bfq_check_fifo(struct bfq_queue *bfqq) -{ - struct bfq_data *bfqd = bfqq->bfqd; - struct request *rq; - int fifo; - - if (bfq_bfqq_fifo_expire(bfqq)) - return NULL; - - bfq_mark_bfqq_fifo_expire(bfqq); - - if (list_empty(&bfqq->fifo)) - return NULL; - - fifo = bfq_bfqq_sync(bfqq); - rq = rq_entry_fifo(bfqq->fifo.next); - - if (time_before(jiffies, rq->start_time + bfqd->bfq_fifo_expire[fifo])) - return NULL; - - return rq; -} - -static inline int -bfq_prio_to_maxrq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - const int base_rq = bfqd->bfq_slice_async_rq; - - WARN_ON(bfqq->ioprio >= IOPRIO_BE_NR); - - return 2 * (base_rq + base_rq * (BFQ_PRIO_LISTS - 1 - bfqq->ioprio)); -} - -static inline bfq_service_t bfq_bfqq_budget_left(struct bfq_queue *bfqq) -{ - return bfqq->act_budget - bfqq->service; -} - -static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - BUG_ON(bfqq != bfqd->active_queue); - - if (bfqd->active_cic != NULL) { - put_io_context(bfqd->active_cic->ioc); - bfqd->active_cic = NULL; - } - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) - bfq_del_bfqq_busy(bfqd, bfqq); - else - bfq_activate_bfqq(bfqd, bfqq); - - bfqd->active_queue = NULL; - del_timer(&bfqd->idle_slice_timer); -} - -static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - int timed_out) -{ - struct request *next_rq; - - BUG_ON(bfqq != bfqd->active_queue); - - if (timed_out == 0) { - bfqq->budget = min(bfqq->budget + BFQ_BUDGET_STEP, - (bfq_service_t)bfqd->bfq_max_budget); - - /* - * This is to be sure that we have enough budget for the - * next request, and is correct only because we are sure - * that the the active queue will be requeued immediately, - * since the queue may not be the one to serve (its finish - * timestamp needs to be updated to the new budget.) 
- * IOW __bfq_bfqq_recalc_budget() must be followed by - * __bfq_bfqq_expire(). - */ - next_rq = bfqq->next_rq; - bfqq->act_budget = max(bfqq->budget, next_rq->hard_nr_sectors); - } else - bfqq->budget = max(bfqq->service, (bfq_service_t)4); -} - -static void bfq_bfqq_expire(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - int timed_out) -{ - __bfq_bfqq_recalc_budget(bfqd, bfqq, timed_out); - __bfq_bfqq_expire(bfqd, bfqq); -} - -/* - * Select a queue for service. If we have a current active queue, - * check whether to continue servicing it, or retrieve and set a new one. - */ -static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq; - struct request *next_rq; - - bfqq = bfqd->active_queue; - if (bfqq == NULL) - goto new_queue; - - next_rq = bfqq->next_rq; - /* - * If bfqq has requests queued and it has enough budget left to - * serve it keep the queue, otherwise expire it. - */ - if (next_rq != NULL) { - if (next_rq->hard_nr_sectors > bfq_bfqq_budget_left(bfqq)) { - __bfq_bfqq_recalc_budget(bfqd, bfqq, 0); - goto expire; - } else - goto keep_queue; - } - - /* - * No requests pending. If the active queue still has requests in - * flight or is idling for a new request, allow either of these - * conditions to happen (or time out) before selecting a new queue. - */ - if (timer_pending(&bfqd->idle_slice_timer) || - (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq))) { - bfqq = NULL; - goto keep_queue; - } - -expire: - __bfq_bfqq_expire(bfqd, bfqq); -new_queue: - bfqq = bfq_set_active_queue(bfqd); -keep_queue: - return bfqq; -} - -/* - * Dispatch some requests from bfqq, moving them to the request queue - * dispatch list. - */ -static int -__bfq_dispatch_requests(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int max_dispatch) -{ - int dispatched = 0; - - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - - do { - struct request *rq; - - /* - * follow expired path, else get first next available - */ - rq = bfq_check_fifo(bfqq); - if (rq == NULL) - rq = bfqq->next_rq; - - if (rq->hard_nr_sectors > bfq_bfqq_budget_left(bfqq)) { - /* - * Expire the queue for budget exhaustion, and - * make sure that the next act_budget is enough - * to serve the next request, even if it comes - * from the fifo expired path. - */ - bfqq->next_rq = rq; - bfq_bfqq_expire(bfqd, bfqq, 0); - goto out; - } - - /* - * finally, insert request into driver dispatch list - */ - bfq_bfqq_served(bfqd, bfqq, rq->hard_nr_sectors); - bfq_dispatch_insert(bfqd->queue, rq); - - dispatched++; - - if (bfqd->active_cic == NULL) { - atomic_inc(&RQ_CIC(rq)->ioc->refcount); - bfqd->active_cic = RQ_CIC(rq); - } - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) - break; - } while (dispatched < max_dispatch); - - /* - * Expire an async queue immediately if it has used up its slice. - * Idle queues always expire after 1 dispatch round. A better - * approach to handle async queues would be to use a max_async_budget - * instead of slice_asyn_rq. - */ - if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && - dispatched >= bfq_prio_to_maxrq(bfqd, bfqq)) || - bfq_class_idle(bfqq))) - __bfq_bfqq_expire(bfqd, bfqq); - -out: - return dispatched; -} - -static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -{ - int dispatched = 0; - - while (bfqq->next_rq != NULL) { - bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); - dispatched++; - } - - BUG_ON(!list_empty(&bfqq->fifo)); - return dispatched; -} - -/* - * Drain our current requests. Used for barriers and when switching - * io schedulers on-the-fly. 
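
__bfq_bfqq_recalc_budget() above is the budget feedback: a queue that exhausts its budget grows it additively by BFQ_BUDGET_STEP up to bfq_max_budget, while a queue expired for another reason (timed_out) has its budget shrunk to the service it actually used, never below 4 sectors. A trivial rendering of that rule, with the constants taken from the removed file's defaults:

#include <stdio.h>

#define BFQ_BUDGET_STEP 128
#define BFQ_MAX_BUDGET  4096

/* next budget after an expiry; "service" is what the queue consumed */
static unsigned long next_budget(unsigned long budget, unsigned long service,
                                 int timed_out)
{
        if (!timed_out) {                       /* budget exhausted: grow */
                budget += BFQ_BUDGET_STEP;
                return budget < BFQ_MAX_BUDGET ? budget : BFQ_MAX_BUDGET;
        }
        return service > 4 ? service : 4;       /* timed out: shrink to usage */
}

int main(void)
{
        printf("exhausted at 512   -> %lu\n", next_budget(512, 512, 0));
        printf("timed out after 64 -> %lu\n", next_budget(512, 64, 1));
        printf("capped             -> %lu\n", next_budget(4090, 4090, 0));
        return 0;
}
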
- */ -static int bfq_forced_dispatch(struct bfq_data *bfqd) -{ - struct bfq_wfqdata *wfqd; - struct bfq_queue *bfqq; - int dispatched = 0, i; - struct rb_node *n; - - bfqq = bfqd->active_queue; - if (bfqq != NULL) - __bfq_bfqq_expire(bfqd, bfqq); - - /* - * Loop through classes, and be careful to leave the scheduler - * in a consistent state, as feedback mechanisms and vtime - * updates cannot be disabled during the process. - */ - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { - wfqd = &bfqd->service_tree[i]; - while ((n = rb_first(&wfqd->active)) != NULL) { - bfqq = rb_entry(n, struct bfq_queue, rb_node); - dispatched += __bfq_forced_dispatch_bfqq(bfqq); - bfqq->budget = bfqd->bfq_max_budget; - } - bfq_forget_idle(wfqd); - } - - BUG_ON(bfqd->busy_queues != 0); - - return dispatched; -} - -static int bfq_dispatch_requests(struct request_queue *q, int force) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq; - int dispatched; - - if (bfqd->busy_queues == 0) - return 0; - - if (unlikely(force)) - return bfq_forced_dispatch(bfqd); - - dispatched = 0; - while ((bfqq = bfq_select_queue(bfqd)) != NULL) { - int max_dispatch; - - max_dispatch = bfqd->bfq_quantum; - if (bfq_class_idle(bfqq)) - max_dispatch = 1; - - if (bfqq->dispatched >= max_dispatch) { - if (bfqd->busy_queues > 1) - break; - if (bfqq->dispatched >= 4 * max_dispatch) - break; - } - - if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) - break; - - bfq_clear_bfqq_wait_request(bfqq); - BUG_ON(timer_pending(&bfqd->idle_slice_timer)); - - dispatched += __bfq_dispatch_requests(bfqd, bfqq, max_dispatch); - } - - return dispatched; -} - -/* - * task holds one reference to the queue, dropped when task exits. each rq - * in-flight on this queue also holds a reference, dropped when rq is freed. - * - * queue lock must be held here. - */ -static void bfq_put_queue(struct bfq_queue *bfqq) -{ - struct bfq_data *bfqd = bfqq->bfqd; - - BUG_ON(atomic_read(&bfqq->ref) <= 0); - - if (!atomic_dec_and_test(&bfqq->ref)) - return; - - BUG_ON(rb_first(&bfqq->sort_list) != NULL); - BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); - BUG_ON(bfq_bfqq_busy(bfqq)); - BUG_ON(bfqd->active_queue == bfqq); - - kmem_cache_free(bfq_pool, bfqq); -} - -/* - * Call func for each cic attached to this ioc. 
- */ -static void -call_for_each_cic(struct io_context *ioc, - void (*func)(struct io_context *, struct cfq_io_context *)) -{ - struct cfq_io_context *cic; - struct hlist_node *n; - - rcu_read_lock(); - hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list) - func(ioc, cic); - rcu_read_unlock(); -} - -static void bfq_cic_free_rcu(struct rcu_head *head) -{ - struct cfq_io_context *cic; - - cic = container_of(head, struct cfq_io_context, rcu_head); - - kmem_cache_free(bfq_ioc_pool, cic); - elv_ioc_count_dec(ioc_count); - - if (ioc_gone && !elv_ioc_count_read(ioc_count)) - complete(ioc_gone); -} - -static void bfq_cic_free(struct cfq_io_context *cic) -{ - call_rcu(&cic->rcu_head, bfq_cic_free_rcu); -} - -static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) -{ - unsigned long flags; - - BUG_ON(cic->dead_key == 0); - - spin_lock_irqsave(&ioc->lock, flags); - radix_tree_delete(&ioc->radix_root, cic->dead_key); - hlist_del_rcu(&cic->cic_list); - spin_unlock_irqrestore(&ioc->lock, flags); - - bfq_cic_free(cic); -} - -static void bfq_free_io_context(struct io_context *ioc) -{ - /* - * ioc->refcount is zero here, or we are called from elv_unregister(), - * so no more cic's are allowed to be linked into this ioc. So it - * should be ok to iterate over the known list, we will see all cic's - * since no new ones are added. - */ - call_for_each_cic(ioc, cic_free_func); -} - -static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - if (bfqq == bfqd->active_queue) { - __bfq_bfqq_expire(bfqd, bfqq); - bfq_schedule_dispatch(bfqd); - } - - bfq_put_queue(bfqq); -} - -static void __bfq_exit_single_io_context(struct bfq_data *bfqd, - struct cfq_io_context *cic) -{ - struct io_context *ioc = cic->ioc; - - list_del_init(&cic->queue_list); - - /* - * Make sure key == NULL is seen for dead queues - */ - smp_wmb(); - cic->dead_key = (unsigned long)cic->key; - cic->key = NULL; - - if (ioc->ioc_data == cic) - rcu_assign_pointer(ioc->ioc_data, NULL); - - if (cic->cfqq[ASYNC] != NULL) { - bfq_exit_bfqq(bfqd, cic->cfqq[ASYNC]); - cic->cfqq[ASYNC] = NULL; - } - - if (cic->cfqq[SYNC] != NULL) { - bfq_exit_bfqq(bfqd, cic->cfqq[SYNC]); - cic->cfqq[SYNC] = NULL; - } -} - -static void bfq_exit_single_io_context(struct io_context *ioc, - struct cfq_io_context *cic) -{ - struct bfq_data *bfqd = cic->key; - - if (bfqd != NULL) { - struct request_queue *q = bfqd->queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - __bfq_exit_single_io_context(bfqd, cic); - spin_unlock_irqrestore(q->queue_lock, flags); - } -} - -/* - * The process that ioc belongs to has exited, we need to clean up - * and put the internal structures we have that belongs to that process. 
- */ -static void bfq_exit_io_context(struct io_context *ioc) -{ - call_for_each_cic(ioc, bfq_exit_single_io_context); -} - -static struct cfq_io_context * -bfq_alloc_io_context(struct bfq_data *bfqd, gfp_t gfp_mask) -{ - struct cfq_io_context *cic; - - cic = kmem_cache_alloc_node(bfq_ioc_pool, gfp_mask | __GFP_ZERO, - bfqd->queue->node); - if (cic != NULL) { - cic->last_end_request = jiffies; - INIT_LIST_HEAD(&cic->queue_list); - INIT_HLIST_NODE(&cic->cic_list); - cic->dtor = bfq_free_io_context; - cic->exit = bfq_exit_io_context; - elv_ioc_count_inc(ioc_count); - } - - return cic; -} - -/* - * With BFQ priorities cannot change anywhere, so the values used to store - * the actual ioprio/class of a queue are old_ioprio and old_ioprio_class, - * that are synced with the ones assigned here (and by the boosting code) - * only when the queue can change its priority. This function must be - * called in the context of the task owning ioc so we cannot delay it to - * the next (re-)activation of the queue. - */ -static void bfq_init_prio_data(struct bfq_queue *bfqq, struct io_context *ioc) -{ - struct task_struct *tsk = current; - int ioprio_class; - - if (!bfq_bfqq_prio_changed(bfqq)) - return; - - ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); - switch (ioprio_class) { - default: - printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class); - case IOPRIO_CLASS_NONE: - /* - * no prio set, place us in the middle of the BE classes - */ - bfqq->ioprio = task_nice_ioprio(tsk); - bfqq->ioprio_class = IOPRIO_CLASS_BE; - break; - case IOPRIO_CLASS_RT: - bfqq->ioprio = task_ioprio(ioc); - bfqq->ioprio_class = IOPRIO_CLASS_RT; - break; - case IOPRIO_CLASS_BE: - bfqq->ioprio = task_ioprio(ioc); - bfqq->ioprio_class = IOPRIO_CLASS_BE; - break; - case IOPRIO_CLASS_IDLE: - bfqq->ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->ioprio = 7; - bfq_clear_bfqq_idle_window(bfqq); - break; - } - - /* - * keep track of original prio settings in case we have to temporarily - * elevate the priority of this queue - */ - bfqq->org_ioprio = bfqq->ioprio; - bfqq->org_ioprio_class = bfqq->ioprio_class; - bfq_clear_bfqq_prio_changed(bfqq); -} - -static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) -{ - struct bfq_data *bfqd = cic->key; - struct bfq_queue *bfqq; - unsigned long flags; - - if (unlikely(bfqd == NULL)) - return; - - spin_lock_irqsave(bfqd->queue->queue_lock, flags); - - bfqq = cic->cfqq[ASYNC]; - if (bfqq != NULL) { - struct bfq_queue *new_bfqq; - new_bfqq = bfq_get_queue(bfqd, ASYNC, cic->ioc, GFP_ATOMIC); - if (new_bfqq != NULL) { - cic->cfqq[ASYNC] = new_bfqq; - bfq_put_queue(bfqq); - } - } - - bfqq = cic->cfqq[SYNC]; - if (bfqq != NULL) - bfq_mark_bfqq_prio_changed(bfqq); - - spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -} - -static void bfq_ioc_set_ioprio(struct io_context *ioc) -{ - call_for_each_cic(ioc, changed_ioprio); - ioc->ioprio_changed = 0; -} - -static struct bfq_queue * -bfq_find_alloc_queue(struct bfq_data *bfqd, int is_sync, - struct io_context *ioc, gfp_t gfp_mask) -{ - struct bfq_queue *bfqq, *new_bfqq = NULL; - struct cfq_io_context *cic; - -retry: - cic = bfq_cic_lookup(bfqd, ioc); - /* cic always exists here */ - bfqq = cic_to_bfqq(cic, is_sync); - - if (bfqq == NULL) { - if (new_bfqq != NULL) { - bfqq = new_bfqq; - new_bfqq = NULL; - } else if (gfp_mask & __GFP_WAIT) { - /* - * Inform the allocator of the fact that we will - * just repeat this allocation if it fails, to allow - * the allocator to do whatever it needs to attempt to - * free memory. 
- */ - spin_unlock_irq(bfqd->queue->queue_lock); - new_bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_NOFAIL | __GFP_ZERO, - bfqd->queue->node); - spin_lock_irq(bfqd->queue->queue_lock); - goto retry; - } else { - bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_ZERO, - bfqd->queue->node); - if (bfqq == NULL) - goto out; - } - - RB_CLEAR_NODE(&bfqq->rb_node); - INIT_LIST_HEAD(&bfqq->fifo); - - atomic_set(&bfqq->ref, 0); - bfqq->bfqd = bfqd; - bfqq->budget = bfqd->bfq_max_budget; - - bfq_mark_bfqq_prio_changed(bfqq); - - bfq_init_prio_data(bfqq, ioc); - bfqq->act_ioprio = bfqq->ioprio; - bfqq->act_ioprio_class = bfqq->ioprio_class; - bfqq->weight = bfq_ioprio_to_weight(bfqq); - - if (is_sync) { - if (!bfq_class_idle(bfqq)) - bfq_mark_bfqq_idle_window(bfqq); - bfq_mark_bfqq_sync(bfqq); - } - } - - if (new_bfqq != NULL) - kmem_cache_free(bfq_pool, new_bfqq); - -out: - WARN_ON((gfp_mask & __GFP_WAIT) && bfqq == NULL); - return bfqq; -} - -static struct bfq_queue ** -bfq_async_queue_prio(struct bfq_data *bfqd, int ioprio_class, int ioprio) -{ - switch (ioprio_class) { - case IOPRIO_CLASS_RT: - return &bfqd->async_bfqq[0][ioprio]; - case IOPRIO_CLASS_BE: - return &bfqd->async_bfqq[1][ioprio]; - case IOPRIO_CLASS_IDLE: - return &bfqd->async_idle_bfqq; - default: - BUG(); - } -} - -static struct bfq_queue * -bfq_get_queue(struct bfq_data *bfqd, int is_sync, struct io_context *ioc, - gfp_t gfp_mask) -{ - const int ioprio = task_ioprio(ioc); - const int ioprio_class = task_ioprio_class(ioc); - struct bfq_queue **async_bfqq = NULL; - struct bfq_queue *bfqq = NULL; - - if (!is_sync) { - async_bfqq = bfq_async_queue_prio(bfqd, ioprio_class, ioprio); - bfqq = *async_bfqq; - } - - if (bfqq == NULL) { - bfqq = bfq_find_alloc_queue(bfqd, is_sync, ioc, gfp_mask); - if (bfqq == NULL) - return NULL; - } - - /* - * pin the queue now that it's allocated, scheduler exit will prune it - */ - if (!is_sync && *async_bfqq == NULL) { - atomic_inc(&bfqq->ref); - *async_bfqq = bfqq; - } - - atomic_inc(&bfqq->ref); - return bfqq; -} - -/* - * We drop cfq io contexts lazily, so we may find a dead one. - */ -static void -bfq_drop_dead_cic(struct bfq_data *bfqd, struct io_context *ioc, - struct cfq_io_context *cic) -{ - unsigned long flags; - - WARN_ON(!list_empty(&cic->queue_list)); - - spin_lock_irqsave(&ioc->lock, flags); - - BUG_ON(ioc->ioc_data == cic); - - radix_tree_delete(&ioc->radix_root, (unsigned long)bfqd); - hlist_del_rcu(&cic->cic_list); - spin_unlock_irqrestore(&ioc->lock, flags); - - bfq_cic_free(cic); -} - -static struct cfq_io_context * -bfq_cic_lookup(struct bfq_data *bfqd, struct io_context *ioc) -{ - struct cfq_io_context *cic; - void *k; - - if (unlikely(ioc == NULL)) - return NULL; - - /* - * we maintain a last-hit cache, to avoid browsing over the tree - */ - cic = rcu_dereference(ioc->ioc_data); - if (cic != NULL && cic->key == bfqd) - return cic; - - do { - rcu_read_lock(); - cic = radix_tree_lookup(&ioc->radix_root, (unsigned long)bfqd); - rcu_read_unlock(); - if (cic == NULL) - break; - /* ->key must be copied to avoid race with bfq_exit_queue() */ - k = cic->key; - if (unlikely(k == NULL)) { - bfq_drop_dead_cic(bfqd, ioc, cic); - continue; - } - - rcu_assign_pointer(ioc->ioc_data, cic); - break; - } while (1); - - return cic; -} - -/* - * Add cic into ioc, using bfqd as the search key. This enables us to lookup - * the process specific cfq io context when entered from the block layer. - * Also adds the cic to a per-bfqd list, used when this queue is removed. 
- */ -static int bfq_cic_link(struct bfq_data *bfqd, struct io_context *ioc, - struct cfq_io_context *cic, gfp_t gfp_mask) -{ - unsigned long flags; - int ret; - - ret = radix_tree_preload(gfp_mask); - if (ret == 0) { - cic->ioc = ioc; - cic->key = bfqd; - - spin_lock_irqsave(&ioc->lock, flags); - ret = radix_tree_insert(&ioc->radix_root, - (unsigned long)bfqd, cic); - if (ret == 0) - hlist_add_head_rcu(&cic->cic_list, &ioc->bfq_cic_list); - spin_unlock_irqrestore(&ioc->lock, flags); - - radix_tree_preload_end(); - - if (ret == 0) { - spin_lock_irqsave(bfqd->queue->queue_lock, flags); - list_add(&cic->queue_list, &bfqd->cic_list); - spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); - } - } - - if (ret != 0) - printk(KERN_ERR "bfq: cic link failed!\n"); - - return ret; -} - -/* - * Setup general io context and cfq io context. There can be several cfq - * io contexts per general io context, if this process is doing io to more - * than one device managed by cfq. - * Since bfq uses the same io contexts as cfq, we use the same tree to store - * either cfq and bfq contexts; the lookup is done using a bfqd/bfqd key, so - * we cannot have clashes and the key identifies the scheduler type too. - */ -static struct cfq_io_context * -bfq_get_io_context(struct bfq_data *bfqd, gfp_t gfp_mask) -{ - struct io_context *ioc = NULL; - struct cfq_io_context *cic; - - might_sleep_if(gfp_mask & __GFP_WAIT); - - ioc = get_io_context(gfp_mask, bfqd->queue->node); - if (ioc == NULL) - return NULL; - - cic = bfq_cic_lookup(bfqd, ioc); - if (cic != NULL) - goto out; - - cic = bfq_alloc_io_context(bfqd, gfp_mask); - if (cic == NULL) - goto err; - - if (bfq_cic_link(bfqd, ioc, cic, gfp_mask) != 0) - goto err_free; - -out: - if (unlikely(ioc->ioprio_changed)) { - /* pairs with wmb() in set_task_ioprio() in fs/ioprio.c */ - rmb(); - bfq_ioc_set_ioprio(ioc); - } - - return cic; -err_free: - bfq_cic_free(cic); -err: - put_io_context(ioc); - return NULL; -} - -static void -bfq_update_io_thinktime(struct bfq_data *bfqd, struct cfq_io_context *cic) -{ - unsigned long elapsed = jiffies - cic->last_end_request; - unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); - - cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; - cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; - cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; -} - -static void -bfq_update_io_seektime(struct bfq_data *bfqd, struct cfq_io_context *cic, - struct request *rq) -{ - sector_t sdist; - u64 total; - - if (cic->last_request_pos < rq->sector) - sdist = rq->sector - cic->last_request_pos; - else - sdist = cic->last_request_pos - rq->sector; - - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc - */ - if (cic->seek_samples <= 60) /* second&third seek */ - sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024); - else - sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64); - - cic->seek_samples = (7*cic->seek_samples + 256) / 8; - cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; - total = cic->seek_total + (cic->seek_samples/2); - do_div(total, cic->seek_samples); - cic->seek_mean = (sector_t)total; -} - -/* - * Disable idle window if the process thinks too long or seeks so much that - * it doesn't matter - */ -static void -bfq_update_idle_window(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct cfq_io_context *cic) -{ - int enable_idle; - - /* - * Don't idle for async or idle io prio class - */ - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) - return; - - 
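/*
 * Illustration only, not part of the patch: the think-time and seek
 * figures kept by bfq_update_io_thinktime()/bfq_update_io_seektime()
 * above are 7/8-weighted moving averages in fixed point; the sample
 * count converges to 256 and the running total is kept scaled by the
 * same factor, so the mean is recovered with a single division.  The
 * sketch below shows the same arithmetic in isolation; all names here
 * are made up for the example.
 */
struct fp_ewma {
	unsigned long samples;	/* converges towards 256 */
	unsigned long total;	/* converges towards 256 * mean */
	unsigned long mean;
};

static void fp_ewma_add(struct fp_ewma *e, unsigned long sample)
{
	e->samples = (7 * e->samples + 256) / 8;
	e->total   = (7 * e->total + 256 * sample) / 8;
	/* + samples/2 rounds to nearest, as the seek mean above does */
	e->mean    = (e->total + e->samples / 2) / e->samples;
}
/*
 * Feeding a steady sample of 4 drives mean to 4; a single outlier of 40
 * then lifts it only by (40 - 4) / 8, i.e. 4.5, on the next step, which
 * is what keeps the idle-window heuristic resistant to isolated spikes.
 */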
enable_idle = bfq_bfqq_idle_window(bfqq); - - if (atomic_read(&cic->ioc->nr_tasks) == 0 || - bfqd->bfq_slice_idle == 0 || (bfqd->hw_tag && CIC_SEEKY(cic))) - enable_idle = 0; - else if (sample_valid(cic->ttime_samples)) { - if (cic->ttime_mean > bfqd->bfq_slice_idle) - enable_idle = 0; - else - enable_idle = 1; - } - - if (enable_idle) - bfq_mark_bfqq_idle_window(bfqq); - else - bfq_clear_bfqq_idle_window(bfqq); -} - -/* - * Called when a new fs request (rq) is added (to bfqq). Check if there's - * something we should do about it - */ -static void -bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct request *rq) -{ - struct cfq_io_context *cic = RQ_CIC(rq); - - if (rq_is_meta(rq)) - bfqq->meta_pending++; - - bfq_update_io_thinktime(bfqd, cic); - bfq_update_io_seektime(bfqd, cic, rq); - bfq_update_idle_window(bfqd, bfqq, cic); - - cic->last_request_pos = rq->sector + rq->nr_sectors; - - if (bfqq == bfqd->active_queue && bfq_bfqq_wait_request(bfqq)) { - /* - * If we are waiting for a request for this queue, let it rip - * immediately and flag that we must not expire this queue - * just now. - */ - bfq_clear_bfqq_wait_request(bfqq); - del_timer(&bfqd->idle_slice_timer); - blk_start_queueing(bfqd->queue); - } -} - -static void bfq_insert_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - bfq_init_prio_data(bfqq, RQ_CIC(rq)->ioc); - - bfq_add_rq_rb(rq); - - list_add_tail(&rq->queuelist, &bfqq->fifo); - - bfq_rq_enqueued(bfqd, bfqq, rq); -} - -static void bfq_completed_request(struct request_queue *q, struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - const int sync = rq_is_sync(rq); - unsigned long now; - - now = jiffies; - - WARN_ON(!bfqd->rq_in_driver); - WARN_ON(!bfqq->dispatched); - bfqd->rq_in_driver--; - bfqq->dispatched--; - - if (bfq_bfqq_sync(bfqq)) - bfqd->sync_flight--; - - if (sync) - RQ_CIC(rq)->last_end_request = now; - - /* - * If this is the active queue, check if it needs to be expired, - * or if we want to idle in case it has no pending requests. - */ - if (bfqd->active_queue == bfqq && sync && - RB_EMPTY_ROOT(&bfqq->sort_list)) - bfq_arm_slice_timer(bfqd); - - if (!bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); -} - -/* - * we temporarily boost lower priority queues if they are holding fs exclusive - * resources. 
they are boosted to normal prio (CLASS_BE/4) - */ -static void bfq_prio_boost(struct bfq_queue *bfqq) -{ - if (has_fs_excl()) { - /* - * boost idle prio on transactions that would lock out other - * users of the filesystem - */ - if (bfq_class_idle(bfqq)) - bfqq->ioprio_class = IOPRIO_CLASS_BE; - if (bfqq->ioprio > IOPRIO_NORM) - bfqq->ioprio = IOPRIO_NORM; - } else { - /* - * check if we need to unboost the queue - */ - if (bfqq->ioprio_class != bfqq->org_ioprio_class) - bfqq->ioprio_class = bfqq->org_ioprio_class; - if (bfqq->ioprio != bfqq->org_ioprio) - bfqq->ioprio = bfqq->org_ioprio; - } -} - -static inline int __bfq_may_queue(struct bfq_queue *bfqq) -{ - if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { - bfq_clear_bfqq_must_alloc(bfqq); - return ELV_MQUEUE_MUST; - } - - return ELV_MQUEUE_MAY; -} - -static int bfq_may_queue(struct request_queue *q, int rw) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct task_struct *tsk = current; - struct cfq_io_context *cic; - struct bfq_queue *bfqq; - - /* - * don't force setup of a queue from here, as a call to may_queue - * does not necessarily imply that a request actually will be queued. - * so just lookup a possibly existing queue, or return 'may queue' - * if that fails - */ - cic = bfq_cic_lookup(bfqd, tsk->io_context); - if (cic == NULL) - return ELV_MQUEUE_MAY; - - bfqq = cic_to_bfqq(cic, rw & REQ_RW_SYNC); - if (bfqq != NULL) { - bfq_init_prio_data(bfqq, cic->ioc); - bfq_prio_boost(bfqq); - - return __bfq_may_queue(bfqq); - } - - return ELV_MQUEUE_MAY; -} - -/* - * queue lock held here - */ -static void bfq_put_request(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - if (bfqq != NULL) { - const int rw = rq_data_dir(rq); - - BUG_ON(!bfqq->allocated[rw]); - bfqq->allocated[rw]--; - - put_io_context(RQ_CIC(rq)->ioc); - - rq->elevator_private = NULL; - rq->elevator_private2 = NULL; - - bfq_put_queue(bfqq); - } -} - -/* - * Allocate cfq data structures associated with this request. 
- */ -static int -bfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct cfq_io_context *cic; - const int rw = rq_data_dir(rq); - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; - unsigned long flags; - - might_sleep_if(gfp_mask & __GFP_WAIT); - - cic = bfq_get_io_context(bfqd, gfp_mask); - - spin_lock_irqsave(q->queue_lock, flags); - - if (cic == NULL) - goto queue_fail; - - bfqq = cic_to_bfqq(cic, is_sync); - if (bfqq == NULL) { - bfqq = bfq_get_queue(bfqd, is_sync, cic->ioc, gfp_mask); - - if (bfqq == NULL) - goto queue_fail; - - cic_set_bfqq(cic, bfqq, is_sync); - } - - bfqq->allocated[rw]++; - atomic_inc(&bfqq->ref); - - spin_unlock_irqrestore(q->queue_lock, flags); - - rq->elevator_private = cic; - rq->elevator_private2 = bfqq; - - return 0; - -queue_fail: - if (cic != NULL) - put_io_context(cic->ioc); - - bfq_schedule_dispatch(bfqd); - spin_unlock_irqrestore(q->queue_lock, flags); - - return 1; -} - -static void bfq_kick_queue(struct work_struct *work) -{ - struct bfq_data *bfqd = - container_of(work, struct bfq_data, unplug_work); - struct request_queue *q = bfqd->queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - blk_start_queueing(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} - -/* - * Timer running if the active_queue is currently idling inside its time slice - */ -static void bfq_idle_slice_timer(unsigned long data) -{ - struct bfq_data *bfqd = (struct bfq_data *)data; - struct bfq_queue *bfqq; - unsigned long flags; - - spin_lock_irqsave(bfqd->queue->queue_lock, flags); - - bfqq = bfqd->active_queue; - /* - * Theoretical race here: active_queue can be NULL or different - * from the queue that was idling if the timer handler spins on - * the queue_lock and a new request arrives for the current - * queue and there is a full dispatch cycle that changes the - * active_queue. This can hardly happen, but in the worst case - * we just expire a queue too early. 
- */ - if (bfqq != NULL && bfq_bfqq_wait_request(bfqq)) { - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - - bfq_bfqq_expire(bfqd, bfqq, 1); - bfq_schedule_dispatch(bfqd); - } - - BUG_ON(bfqq == NULL && bfqd->busy_queues != 0); - - spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -} - -static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -{ - del_timer_sync(&bfqd->idle_slice_timer); - kblockd_flush_work(&bfqd->unplug_work); -} - -static void bfq_put_async_queues(struct bfq_data *bfqd) -{ - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (bfqd->async_bfqq[0][i] != NULL) - bfq_put_queue(bfqd->async_bfqq[0][i]); - if (bfqd->async_bfqq[1][i] != NULL) - bfq_put_queue(bfqd->async_bfqq[1][i]); - } - - if (bfqd->async_idle_bfqq != NULL) - bfq_put_queue(bfqd->async_idle_bfqq); -} - -static void bfq_exit_queue(elevator_t *e) -{ - struct bfq_data *bfqd = e->elevator_data; - struct request_queue *q = bfqd->queue; - struct bfq_wfqdata *wfqd; - struct bfq_queue *bfqq, *next; - int i; - - bfq_shutdown_timer_wq(bfqd); - - spin_lock_irq(q->queue_lock); - - while (!list_empty(&bfqd->cic_list)) { - struct cfq_io_context *cic = list_entry(bfqd->cic_list.next, - struct cfq_io_context, - queue_list); - - __bfq_exit_single_io_context(bfqd, cic); - } - - bfq_put_async_queues(bfqd); - - BUG_ON(bfqd->active_queue != NULL); - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { - wfqd = &bfqd->service_tree[i]; - BUG_ON(!RB_EMPTY_ROOT(&wfqd->active)); - bfqq = wfqd->first_idle; - while (bfqq != NULL) { - next = bfq_bfqq_of(rb_next(&bfqq->rb_node)); - bfq_put_idle_queue(wfqd, bfqq); - bfqq = next; - } - } - spin_unlock_irq(q->queue_lock); - - bfq_shutdown_timer_wq(bfqd); - - kfree(bfqd); -} - -static void *bfq_init_queue(struct request_queue *q) -{ - struct bfq_data *bfqd; - int i; - - bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node); - if (bfqd == NULL) - return NULL; - - INIT_LIST_HEAD(&bfqd->cic_list); - - bfqd->queue = q; - - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - bfqd->service_tree[i] = BFQ_WFQDATA_INIT; - - init_timer(&bfqd->idle_slice_timer); - bfqd->idle_slice_timer.function = bfq_idle_slice_timer; - bfqd->idle_slice_timer.data = (unsigned long)bfqd; - - INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); - - bfqd->bfq_quantum = bfq_quantum; - bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; - bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; - bfqd->bfq_back_max = bfq_back_max; - bfqd->bfq_back_penalty = bfq_back_penalty; - bfqd->bfq_slice_async_rq = bfq_slice_async_rq; - bfqd->bfq_slice_idle = bfq_slice_idle; - bfqd->bfq_max_budget = bfq_max_budget; - - return bfqd; -} - -static void bfq_slab_kill(void) -{ - if (bfq_pool != NULL) - kmem_cache_destroy(bfq_pool); - if (bfq_ioc_pool != NULL) - kmem_cache_destroy(bfq_ioc_pool); -} - -static int __init bfq_slab_setup(void) -{ - bfq_pool = KMEM_CACHE(bfq_queue, 0); - if (bfq_pool == NULL) - goto fail; - - bfq_ioc_pool = kmem_cache_create("bfq_io_context", - sizeof(struct cfq_io_context), - __alignof__(struct cfq_io_context), - 0, NULL); - if (bfq_ioc_pool == NULL) - goto fail; - - return 0; -fail: - bfq_slab_kill(); - return -ENOMEM; -} - -/* - * sysfs parts below --> - */ -static ssize_t -bfq_var_show(unsigned int var, char *page) -{ - return sprintf(page, "%d\n", var); -} - -static ssize_t -bfq_var_store(unsigned int *var, const char *page, size_t count) -{ - char *p = (char *)page; - - *var = simple_strtoul(p, &p, 10); - return count; -} - -#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -static ssize_t __FUNC(elevator_t *e, char *page) \ -{ \ - 
struct bfq_data *bfqd = e->elevator_data; \ - unsigned int __data = __VAR; \ - if (__CONV) \ - __data = jiffies_to_msecs(__data); \ - return bfq_var_show(__data, (page)); \ -} -SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0); -SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); -SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); -SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); -SHOW_FUNCTION(bfq_slice_async_rq_show, bfqd->bfq_slice_async_rq, 0); -SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_max_budget, 0); -#undef SHOW_FUNCTION - -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \ -{ \ - struct bfq_data *bfqd = e->elevator_data; \ - unsigned int __data; \ - int ret = bfq_var_store(&__data, (page), count); \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ - if (__CONV) \ - *(__PTR) = msecs_to_jiffies(__data); \ - else \ - *(__PTR) = __data; \ - return ret; \ -} -STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, UINT_MAX, 0); -STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, - UINT_MAX, 1); -STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, - UINT_MAX, 1); -STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, UINT_MAX, 0); -STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, - UINT_MAX, 0); -STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, UINT_MAX, 1); -STORE_FUNCTION(bfq_slice_async_rq_store, &bfqd->bfq_slice_async_rq, 1, - UINT_MAX, 0); -STORE_FUNCTION(bfq_max_budget_store, &bfqd->bfq_max_budget, 0, UINT_MAX, 0); -#undef STORE_FUNCTION - -#define BFQ_ATTR(name) \ - __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) - -static struct elv_fs_entry bfq_attrs[] = { - BFQ_ATTR(quantum), - BFQ_ATTR(fifo_expire_sync), - BFQ_ATTR(fifo_expire_async), - BFQ_ATTR(back_seek_max), - BFQ_ATTR(back_seek_penalty), - BFQ_ATTR(slice_async_rq), - BFQ_ATTR(slice_idle), - BFQ_ATTR(max_budget), - __ATTR_NULL -}; - -static struct elevator_type iosched_bfq = { - .ops = { - .elevator_merge_fn = bfq_merge, - .elevator_merged_fn = bfq_merged_request, - .elevator_merge_req_fn = bfq_merged_requests, - .elevator_allow_merge_fn = bfq_allow_merge, - .elevator_dispatch_fn = bfq_dispatch_requests, - .elevator_add_req_fn = bfq_insert_request, - .elevator_activate_req_fn = bfq_activate_request, - .elevator_deactivate_req_fn = bfq_deactivate_request, - .elevator_queue_empty_fn = bfq_queue_empty, - .elevator_completed_req_fn = bfq_completed_request, - .elevator_former_req_fn = elv_rb_former_request, - .elevator_latter_req_fn = elv_rb_latter_request, - .elevator_set_req_fn = bfq_set_request, - .elevator_put_req_fn = bfq_put_request, - .elevator_may_queue_fn = bfq_may_queue, - .elevator_init_fn = bfq_init_queue, - .elevator_exit_fn = bfq_exit_queue, - .trim = bfq_free_io_context, - }, - .elevator_attrs = bfq_attrs, - .elevator_name = "bfq", - .elevator_owner = THIS_MODULE, -}; - -static int __init bfq_init(void) -{ - /* - * could be 0 on HZ < 1000 setups - */ - if (bfq_slice_idle == 0) - bfq_slice_idle = 1; - - if (bfq_slab_setup()) - return -ENOMEM; - - elv_register(&iosched_bfq); - - return 0; -} - -static void __exit bfq_exit(void) -{ - DECLARE_COMPLETION_ONSTACK(all_gone); - 
elv_unregister(&iosched_bfq); - ioc_gone = &all_gone; - /* ioc_gone's update must be visible before reading ioc_count */ - smp_wmb(); - if (elv_ioc_count_read(ioc_count) != 0) - wait_for_completion(ioc_gone); - bfq_slab_kill(); -} - -module_init(bfq_init); -module_exit(bfq_exit); - -MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler"); diff -puN block/blk-core.c~revert-git-block block/blk-core.c --- a/block/blk-core.c~revert-git-block +++ a/block/blk-core.c @@ -26,6 +26,8 @@ #include <linux/swap.h> #include <linux/writeback.h> #include <linux/task_io_accounting_ops.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> #include <linux/blktrace_api.h> #include <linux/fault-inject.h> @@ -48,6 +50,8 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; +static DEFINE_PER_CPU(struct list_head, blk_cpu_done); + static void drive_stat_acct(struct request *rq, int new_io) { struct hd_struct *part; @@ -109,7 +113,7 @@ void blk_rq_init(struct request_queue *q memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); - rq->cpu = -1; + INIT_LIST_HEAD(&rq->donelist); rq->q = q; rq->sector = rq->hard_sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); @@ -178,11 +182,6 @@ void blk_dump_rq_flags(struct request *r } EXPORT_SYMBOL(blk_dump_rq_flags); -static inline int blk_is_io_cpu(struct request_queue *q) -{ - return cpu_isset(smp_processor_id(), q->queue_cpu); -} - /* * "plug" the device if there are no outstanding requests: this will * force the transfer to start only after we have put all the requests @@ -289,7 +288,7 @@ void blk_unplug_timeout(unsigned long da blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, q->rq.count[READ] + q->rq.count[WRITE]); - kblockd_schedule_work(q, &q->unplug_work); + kblockd_schedule_work(&q->unplug_work); } void blk_unplug(struct request_queue *q) @@ -306,22 +305,6 @@ void blk_unplug(struct request_queue *q) } EXPORT_SYMBOL(blk_unplug); -static void blk_invoke_request_fn(struct request_queue *q) -{ - /* - * one level of recursion is ok and is much faster than kicking - * the unplug handling - */ - if (blk_is_io_cpu(q) && - !test_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { - q->request_fn(q); - queue_flag_clear(QUEUE_FLAG_REENTER, q); - } else { - queue_flag_set(QUEUE_FLAG_PLUGGED, q); - kblockd_schedule_work(q, &q->unplug_work); - } -} - /** * blk_start_queue - restart a previously stopped queue * @q: The &struct request_queue in question @@ -336,7 +319,19 @@ void blk_start_queue(struct request_queu WARN_ON(!irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); - blk_invoke_request_fn(q); + + /* + * one level of recursion is ok and is much faster than kicking + * the unplug handling + */ + if (!test_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { + queue_flag_set(QUEUE_FLAG_REENTER, q); + q->request_fn(q); + queue_flag_clear(QUEUE_FLAG_REENTER, q); + } else { + blk_plug_device(q); + kblockd_schedule_work(&q->unplug_work); + } } EXPORT_SYMBOL(blk_start_queue); @@ -390,8 +385,20 @@ void __blk_run_queue(struct request_queu { blk_remove_plug(q); - if (!elv_queue_empty(q)) - blk_invoke_request_fn(q); + /* + * Only recurse once to avoid overrunning the stack, let the unplug + * handling reinvoke the handler shortly if we already got there. 
+ */ + if (!elv_queue_empty(q)) { + if (!test_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { + queue_flag_set(QUEUE_FLAG_REENTER, q); + q->request_fn(q); + queue_flag_clear(QUEUE_FLAG_REENTER, q); + } else { + blk_plug_device(q); + kblockd_schedule_work(&q->unplug_work); + } + } } EXPORT_SYMBOL(__blk_run_queue); @@ -462,8 +469,6 @@ struct request_queue *blk_alloc_queue_no if (!q) return NULL; - cpus_setall(q->queue_cpu); - cpus_setall(q->complete_cpu); q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; q->backing_dev_info.unplug_io_data = q; err = bdi_init(&q->backing_dev_info); @@ -867,10 +872,7 @@ EXPORT_SYMBOL(blk_get_request); */ void blk_start_queueing(struct request_queue *q) { - if (!blk_is_io_cpu(q)) { - queue_flag_set(QUEUE_FLAG_PLUGGED, q); - kblockd_schedule_work(q, &q->unplug_work); - } else if (!blk_queue_plugged(q)) + if (!blk_queue_plugged(q)) q->request_fn(q); else __generic_unplug_device(q); @@ -1182,15 +1184,13 @@ get_rq: init_request_from_bio(req, bio); spin_lock_irq(q->queue_lock); - if (q->queue_flags & (1 << QUEUE_FLAG_SAME_COMP) || - bio_flagged(bio, BIO_CPU_AFFINE)) - req->cpu = blk_cpu_to_group(smp_processor_id()); if (elv_queue_empty(q)) blk_plug_device(q); add_request(q, req); out: if (sync) __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); return 0; @@ -1622,6 +1622,82 @@ static int __end_that_request_first(stru } /* + * splice the completion data to a local structure and hand off to + * process_completion_queue() to complete the requests + */ +static void blk_done_softirq(struct softirq_action *h) +{ + struct list_head *cpu_list, local_list; + + local_irq_disable(); + cpu_list = &__get_cpu_var(blk_cpu_done); + list_replace_init(cpu_list, &local_list); + local_irq_enable(); + + while (!list_empty(&local_list)) { + struct request *rq; + + rq = list_entry(local_list.next, struct request, donelist); + list_del_init(&rq->donelist); + rq->q->softirq_done_fn(rq); + } +} + +static int __cpuinit blk_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_done, cpu), + &__get_cpu_var(blk_cpu_done)); + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_enable(); + } + + return NOTIFY_OK; +} + + +static struct notifier_block blk_cpu_notifier __cpuinitdata = { + .notifier_call = blk_cpu_notify, +}; + +/** + * blk_complete_request - end I/O on a request + * @req: the request being processed + * + * Description: + * Ends all I/O on a request. It does not handle partial completions, + * unless the driver actually implements this in its completion callback + * through requeueing. The actual completion happens out-of-order, + * through a softirq handler. The user must have registered a completion + * callback through blk_queue_softirq_done(). 
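/*
 * Illustration only, not part of the patch: with the softirq completion
 * path restored above, a driver opts in by registering a softirq-context
 * completion handler and then calling blk_complete_request() from its
 * hard-IRQ handler.  The mydev_* names, the hypothetical helper and the
 * use of blk_end_request() to finish the request are assumptions for the
 * sketch, not taken from this patch.
 */
#include <linux/blkdev.h>
#include <linux/interrupt.h>

static struct request *mydev_fetch_completed(void *dev_id);	/* hypothetical */

static void mydev_softirq_done(struct request *rq)
{
	/* runs in BLOCK_SOFTIRQ context via blk_done_softirq() */
	blk_end_request(rq, 0, rq->hard_nr_sectors << 9);
}

static void mydev_init_queue(struct request_queue *q)
{
	blk_queue_softirq_done(q, mydev_softirq_done);
}

static irqreturn_t mydev_irq(int irq, void *dev_id)
{
	struct request *rq = mydev_fetch_completed(dev_id);

	blk_complete_request(rq);	/* defer the real work to the softirq */
	return IRQ_HANDLED;
}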
+ **/ + +void blk_complete_request(struct request *req) +{ + struct list_head *cpu_list; + unsigned long flags; + + BUG_ON(!req->q->softirq_done_fn); + + local_irq_save(flags); + + cpu_list = &__get_cpu_var(blk_cpu_done); + list_add_tail(&req->donelist, cpu_list); + raise_softirq_irqoff(BLOCK_SOFTIRQ); + + local_irq_restore(flags); +} +EXPORT_SYMBOL(blk_complete_request); + +/* * queue lock must be held */ static void end_that_request_last(struct request *req, int error) @@ -1938,18 +2014,9 @@ void blk_rq_bio_prep(struct request_queu rq->rq_disk = bio->bi_bdev->bd_disk; } -int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) +int kblockd_schedule_work(struct work_struct *work) { - int cpu; - - if (blk_is_io_cpu(q)) - return queue_work(kblockd_workqueue, work); - - /* - * would need to be improved, of course... - */ - cpu = first_cpu(q->queue_cpu); - return queue_work_on_cpu(kblockd_workqueue, work, cpu); + return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); @@ -1961,6 +2028,8 @@ EXPORT_SYMBOL(kblockd_flush_work); int __init blk_dev_init(void) { + int i; + kblockd_workqueue = create_workqueue("kblockd"); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); @@ -1971,6 +2040,12 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("blkdev_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); + + open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); + register_hotcpu_notifier(&blk_cpu_notifier); + return 0; } diff -puN block/blk-ioc.c~revert-git-block block/blk-ioc.c --- a/block/blk-ioc.c~revert-git-block +++ a/block/blk-ioc.c @@ -15,12 +15,13 @@ */ static struct kmem_cache *iocontext_cachep; -static void hlist_sched_dtor(struct io_context *ioc, struct hlist_head *list) +static void cfq_dtor(struct io_context *ioc) { - if (!hlist_empty(list)) { + if (!hlist_empty(&ioc->cic_list)) { struct cfq_io_context *cic; - cic = list_entry(list->first, struct cfq_io_context, cic_list); + cic = list_entry(ioc->cic_list.first, struct cfq_io_context, + cic_list); cic->dtor(ioc); } } @@ -40,9 +41,7 @@ int put_io_context(struct io_context *io rcu_read_lock(); if (ioc->aic && ioc->aic->dtor) ioc->aic->dtor(ioc->aic); - - hlist_sched_dtor(ioc, &ioc->cic_list); - hlist_sched_dtor(ioc, &ioc->bfq_cic_list); + cfq_dtor(ioc); rcu_read_unlock(); kmem_cache_free(iocontext_cachep, ioc); @@ -52,18 +51,18 @@ int put_io_context(struct io_context *io } EXPORT_SYMBOL(put_io_context); -static void hlist_sched_exit(struct io_context *ioc, struct hlist_head *list) +static void cfq_exit(struct io_context *ioc) { rcu_read_lock(); - if (!hlist_empty(list)) { + if (!hlist_empty(&ioc->cic_list)) { struct cfq_io_context *cic; - cic = list_entry(list->first, struct cfq_io_context, cic_list); + cic = list_entry(ioc->cic_list.first, struct cfq_io_context, + cic_list); cic->exit(ioc); } rcu_read_unlock(); - } /* Called by the exitting task */ @@ -79,8 +78,7 @@ void exit_io_context(void) if (atomic_dec_and_test(&ioc->nr_tasks)) { if (ioc->aic && ioc->aic->exit) ioc->aic->exit(ioc->aic); - hlist_sched_exit(ioc, &ioc->cic_list); - hlist_sched_exit(ioc, &ioc->bfq_cic_list); + cfq_exit(ioc); put_io_context(ioc); } @@ -102,7 +100,6 @@ struct io_context *alloc_io_context(gfp_ ret->aic = NULL; INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); - INIT_HLIST_HEAD(&ret->bfq_cic_list); ret->ioc_data = NULL; } diff -puN 
block/blk-settings.c~revert-git-block block/blk-settings.c --- a/block/blk-settings.c~revert-git-block +++ a/block/blk-settings.c @@ -404,43 +404,7 @@ void blk_queue_update_dma_alignment(stru } EXPORT_SYMBOL(blk_queue_update_dma_alignment); -/** - * blk_queue_set_completion_cpu - Set IO CPU for completions - * @q: the request queue for the device - * @mask: mask of allowed CPUs - * - * Description: - * This function allows a driver to set a CPU that should handle completions - * for this device. - * - **/ -int blk_queue_set_completion_cpu(struct request_queue *q, cpumask_t mask) -{ - cpus_setall(q->complete_cpu); - cpus_and(q->complete_cpu, q->complete_cpu, mask); - return 0; -} -EXPORT_SYMBOL(blk_queue_set_completion_cpu); - -/** - * blk_queue_set_queue_cpu - Set IO CPU for queuing - * @q: the request queue for the device - * @mask: mask of allowed CPUs - * - * Description: - * This function allows a driver to set a CPU that should handle queuing - * for this device. - * - **/ -int blk_queue_set_queue_cpu(struct request_queue *q, cpumask_t mask) -{ - cpus_setall(q->queue_cpu); - cpus_and(q->queue_cpu, q->queue_cpu, mask); - return 0; -} -EXPORT_SYMBOL(blk_queue_set_queue_cpu); - -int __init blk_settings_init(void) +static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; blk_max_pfn = max_pfn - 1; diff -puN block/blk-softirq.c~revert-git-block /dev/null --- a/block/blk-softirq.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Functions related to softirq rq completions - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> - -#include "blk.h" - -static DEFINE_PER_CPU(struct list_head, blk_cpu_done); - -/* - * Softirq action handler - move entries to local list and loop over them - * while passing them to the queue registered handler. - */ -static void blk_done_softirq(struct softirq_action *h) -{ - struct list_head *cpu_list, local_list; - - local_irq_disable(); - cpu_list = &__get_cpu_var(blk_cpu_done); - list_replace_init(cpu_list, &local_list); - local_irq_enable(); - - while (!list_empty(&local_list)) { - struct request *rq; - - rq = list_entry(local_list.next, struct request, csd.list); - list_del_init(&rq->csd.list); - rq->q->softirq_done_fn(rq); - } -} - -#ifdef CONFIG_SMP -static void trigger_softirq(void *data) -{ - struct request *rq = data; - unsigned long flags; - struct list_head *list; - - local_irq_save(flags); - list = &__get_cpu_var(blk_cpu_done); - list_add_tail(&rq->csd.list, list); - - if (list->next == &rq->csd.list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); - - local_irq_restore(flags); -} - -/* - * Setup and invoke a run of 'trigger_softirq' on the given cpu. 
- */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - if (cpu_online(cpu)) { - struct call_single_data *data = &rq->csd; - - data->func = trigger_softirq; - data->info = rq; - data->flags = 0; - - __smp_call_function_single(cpu, data); - return 0; - } - - return 1; -} -#else /* CONFIG_SMP */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - /* - * We can never get here on UP - */ - BUG(); - return 1; -} -#endif - -static int __cpuinit blk_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_done, cpu), - &__get_cpu_var(blk_cpu_done)); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_enable(); - } - - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata blk_cpu_notifier = { - .notifier_call = blk_cpu_notify, -}; - -/** - * blk_complete_request - end I/O on a request - * @req: the request being processed - * - * Description: - * Ends all I/O on a request. It does not handle partial completions, - * unless the driver actually implements this in its completion callback - * through requeueing. The actual completion happens out-of-order, - * through a softirq handler. The user must have registered a completion - * callback through blk_queue_softirq_done(). - **/ -void blk_complete_request(struct request *req) -{ - struct request_queue *q = req->q; - unsigned long flags; - int ccpu, cpu, group_cpu; - - BUG_ON(!q->softirq_done_fn); - - local_irq_save(flags); - cpu = smp_processor_id(); - group_cpu = blk_cpu_to_group(cpu); - - /* - * Select completion CPU - */ - if ((q->queue_flags & (1 << QUEUE_FLAG_SAME_COMP)) && req->cpu != -1) - ccpu = req->cpu; - else if (cpu_isset(cpu, q->complete_cpu)) - ccpu = cpu; - else - ccpu = first_cpu(q->complete_cpu); - - if (ccpu == cpu || ccpu == group_cpu) { - struct list_head *list; -do_local: - list = &__get_cpu_var(blk_cpu_done); - list_add_tail(&req->csd.list, list); - - /* - * if the list only contains our just added request, - * signal a raise of the softirq. If there are already - * entries there, someone already raised the irq but it - * hasn't run yet. 
- */ - if (list->next == &req->csd.list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); - } else if (raise_blk_irq(ccpu, req)) - goto do_local; - - local_irq_restore(flags); -} -EXPORT_SYMBOL(blk_complete_request); - -__init int blk_softirq_init(void) -{ - int i; - - for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); - - open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); - register_hotcpu_notifier(&blk_cpu_notifier); - return 0; -} -subsys_initcall(blk_softirq_init); diff -puN block/blk-sysfs.c~revert-git-block block/blk-sysfs.c --- a/block/blk-sysfs.c~revert-git-block +++ a/block/blk-sysfs.c @@ -156,83 +156,6 @@ static ssize_t queue_nomerges_store(stru return ret; } -static ssize_t queue_complete_affinity_show(struct request_queue *q, char *page) -{ - ssize_t len = cpumask_scnprintf(page, PAGE_SIZE, q->complete_cpu); - - len += sprintf(page + len, "\n"); - return len; -} - -/* - * Pass in multiple CPUs with: - * # echo 0,1,2 > completion_affinity - */ -static ssize_t queue_complete_affinity_store(struct request_queue *q, - const char *page, size_t count) -{ - cpumask_t mask; - int ret; - - cpus_clear(mask); - ret = cpulist_parse(page, mask); - if (ret < 0) - return ret; - - spin_lock_irq(q->queue_lock); - blk_queue_set_completion_cpu(q, mask); - spin_unlock_irq(q->queue_lock); - return count; -} - -static ssize_t queue_queue_affinity_show(struct request_queue *q, char *page) -{ - ssize_t len = cpumask_scnprintf(page, PAGE_SIZE, q->queue_cpu); - - len += sprintf(page + len, "\n"); - return len; -} - -static ssize_t queue_queue_affinity_store(struct request_queue *q, - const char *page, size_t count) -{ - cpumask_t mask; - int ret; - - cpus_clear(mask); - ret = cpulist_parse(page, mask); - if (ret < 0) - return ret; - - spin_lock_irq(q->queue_lock); - blk_queue_set_queue_cpu(q, mask); - spin_unlock_irq(q->queue_lock); - return count; -} - -static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) -{ - unsigned int same = (q->queue_flags & 1 << (QUEUE_FLAG_SAME_COMP)) != 0; - - return queue_var_show(same, page); -} - -static ssize_t -queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) -{ - unsigned long val; - ssize_t ret; - - ret = queue_var_store(&val, page, count); - spin_lock_irq(q->queue_lock); - if (val) - q->queue_flags |= (1 << QUEUE_FLAG_SAME_COMP); - else - q->queue_flags &= ~(1 << QUEUE_FLAG_SAME_COMP); - spin_unlock_irq(q->queue_lock); - - return ret; -} static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, @@ -274,24 +197,6 @@ static struct queue_sysfs_entry queue_no .store = queue_nomerges_store, }; -static struct queue_sysfs_entry queue_complete_affinity_entry = { - .attr = {.name = "completion_affinity", .mode = S_IRUGO | S_IWUSR }, - .show = queue_complete_affinity_show, - .store = queue_complete_affinity_store, -}; - -static struct queue_sysfs_entry queue_queue_affinity_entry = { - .attr = {.name = "queue_affinity", .mode = S_IRUGO | S_IWUSR }, - .show = queue_queue_affinity_show, - .store = queue_queue_affinity_store, -}; - -static struct queue_sysfs_entry queue_rq_affinity_entry = { - .attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR }, - .show = queue_rq_affinity_show, - .store = queue_rq_affinity_store, -}; - static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -300,9 +205,6 @@ static struct attribute *default_attrs[] &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, 
&queue_nomerges_entry.attr, - &queue_complete_affinity_entry.attr, - &queue_queue_affinity_entry.attr, - &queue_rq_affinity_entry.attr, NULL, }; diff -puN block/blk.h~revert-git-block block/blk.h --- a/block/blk.h~revert-git-block +++ a/block/blk.h @@ -51,16 +51,4 @@ static inline int queue_congestion_off_t return q->nr_congestion_off; } -static inline int blk_cpu_to_group(int cpu) -{ -#ifdef CONFIG_SCHED_MC - cpumask_t mask = cpu_coregroup_map(cpu); - return first_cpu(mask); -#elif defined(CONFIG_SCHED_SMT) - return first_cpu(per_cpu(cpu_sibling_map, cpu)); -#else - return cpu; -#endif -} - #endif diff -puN block/cfq-iosched.c~revert-git-block block/cfq-iosched.c --- a/block/cfq-iosched.c~revert-git-block +++ a/block/cfq-iosched.c @@ -235,7 +235,7 @@ static inline int cfq_bio_sync(struct bi static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) { if (cfqd->busy_queues) - kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); + kblockd_schedule_work(&cfqd->unplug_work); } static int cfq_queue_empty(struct request_queue *q) diff -puN fs/ioprio.c~revert-git-block fs/ioprio.c --- a/fs/ioprio.c~revert-git-block +++ a/fs/ioprio.c @@ -58,8 +58,6 @@ static int set_task_ioprio(struct task_s if (!err) { ioc->ioprio = ioprio; - /* make sure schedulers see the new ioprio value */ - wmb(); ioc->ioprio_changed = 1; } diff -puN fs/splice.c~revert-git-block fs/splice.c --- a/fs/splice.c~revert-git-block +++ a/fs/splice.c @@ -29,7 +29,6 @@ #include <linux/syscalls.h> #include <linux/uio.h> #include <linux/security.h> -#include <linux/mman.h> /* * Attempt to steal a page from a pipe buffer. This should perhaps go into @@ -1178,223 +1177,6 @@ static int copy_from_user_mmap_sem(void } /* - * Just copy the data to user space - */ -static int pipe_to_user_copy(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, struct splice_desc *sd) -{ - char *src; - int ret; - - ret = buf->ops->confirm(pipe, buf); - if (unlikely(ret)) - return ret; - - /* - * See if we can use the atomic maps, by prefaulting in the - * pages and doing an atomic copy - */ - if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { - src = buf->ops->map(pipe, buf, 1); - ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, - sd->len); - buf->ops->unmap(pipe, buf, src); - if (!ret) { - ret = sd->len; - goto out; - } - } - - /* - * No dice, use slow non-atomic map and copy - */ - src = buf->ops->map(pipe, buf, 0); - - ret = sd->len; - if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) - ret = -EFAULT; - - buf->ops->unmap(pipe, buf, src); -out: - if (ret > 0) - sd->u.userptr += ret; - return ret; -} - -/* - * This actor doesn't really do anything interesting, it merely settles - * the pipe page and adds it to the work list for insertion when the entire - * pipe has been processed. - */ -static int pipe_to_user_map(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, struct splice_desc *sd) -{ - struct splice_pipe_desc *spd = sd->u.data; - int error; - - if (buf->len & ~PAGE_MASK) - return -EINVAL; - - error = buf->ops->confirm(pipe, buf); - if (!error) { - spd->pages[spd->nr_pages++] = buf->page; - return buf->len; - } - - return error; -} - -/* - * Setup a vma for this address range, and let pipe_to_user_map() insert - * pages into that. 
- */ -static int vmsplice_pipe_map(struct pipe_inode_info *pipe, - struct splice_desc *sd) -{ - struct mm_struct *mm = current->mm; - struct page *pages[PIPE_BUFFERS]; - struct splice_pipe_desc spd = { - .pages = pages, - }; - struct vm_area_struct *vma; - unsigned long addr; - int ret, i, err; - - if (sd->total_len & ~PAGE_MASK) - return -EINVAL; - - /* - * Run through the pipe buffers and settle the contents. The number - * of processed pages will be put in spd.nr_pages. - */ - addr = (unsigned long) sd->u.userptr; - sd->pos = 0; - sd->u.data = &spd; - err = __splice_from_pipe(pipe, sd, pipe_to_user_map); - if (unlikely(err <= 0)) - return err; - else if (unlikely(!spd.nr_pages)) - return 0; - - /* - * We have a non-zero number of pages available. Now find the - * associated vma so we can establish pages mappings there. - */ - ret = -EINVAL; - down_read(&mm->mmap_sem); - - vma = find_vma(mm, addr); - if (unlikely(!vma)) - goto out; - - for (i = ret = err = 0; i < spd.nr_pages; i++) { - err = vm_insert_page(vma, addr, spd.pages[i]); - if (unlikely(err)) - break; - - addr += PAGE_SIZE; - ret += PAGE_SIZE; - } - -out: - up_read(&mm->mmap_sem); - - if (err && !ret) - ret = err; - - return ret; -} - -/* - * vmsplice a pipe to user memory. If SPLICE_F_MOVE is set, we will attempt - * to move the pipe pages to the user address space. Otherwise a simple - * copy is done. - */ -static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, - unsigned long nr_segs, unsigned int flags) -{ - struct pipe_inode_info *pipe; - struct splice_desc sd; - long spliced, ret; - - pipe = pipe_info(file->f_path.dentry->d_inode); - if (!pipe) - return -EBADF; - - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); - - spliced = ret = 0; - while (nr_segs) { - void __user *base; - size_t len; - - /* - * Get user address base and length for this iovec. - */ - ret = get_user(base, &iov->iov_base); - if (unlikely(ret)) - break; - ret = get_user(len, &iov->iov_len); - if (unlikely(ret)) - break; - - /* - * Sanity check this iovec. 0 read succeeds. - */ - if (unlikely(!len)) - break; - if (unlikely(!base)) { - ret = -EFAULT; - break; - } - - if (unlikely(!access_ok(VERIFY_WRITE, base, len))) { - ret = -EFAULT; - break; - } - - sd.len = 0; - sd.total_len = len; - sd.flags = flags; - sd.u.userptr = base; - sd.pos = 0; - - /* - * SPLICE_F_MOVE is set, don't copy the data but attempt - * to map it into the app address space. - */ - if (flags & SPLICE_F_MOVE) - ret = vmsplice_pipe_map(pipe, &sd); - else - ret = __splice_from_pipe(pipe, &sd, pipe_to_user_copy); - - if (ret < 0) - break; - - spliced += ret; - - /* - * If we transferred less than a pipe buffer length, break - * out of the loop and let the caller retry. - */ - if (ret < len) - break; - - nr_segs--; - iov++; - } - - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); - - if (!spliced) - spliced = ret; - - return spliced; -} - -/* * Map an iov into an array of pages and offset/length tupples. With the * partial_page structure, we can map several non-contiguous ranges into * our ones pages[] map instead of splitting that operation into pieces. 
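/*
 * Illustration only, not part of the patch: vmsplice_to_user(), restored
 * in its plain-copy form further down, is what services a vmsplice(2)
 * call made on the read end of a pipe.  The standalone userspace program
 * below exercises that path; after this revert, passing SPLICE_F_MOVE
 * here no longer attempts to remap pages and simply behaves like the
 * copy case.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	static const char msg[] = "moved through the pipe\n";
	char buf[64];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int pfd[2];
	ssize_t n;

	if (pipe(pfd) < 0 || write(pfd[1], msg, strlen(msg)) < 0)
		return 1;

	/* drain the pipe into buf; in the kernel this is vmsplice_to_user() */
	n = vmsplice(pfd[0], &iov, 1, SPLICE_F_MOVE);
	if (n < 0) {
		perror("vmsplice");
		return 1;
	}

	fwrite(buf, 1, (size_t)n, stdout);
	return 0;
}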
@@ -1500,6 +1282,129 @@ static int get_iovec_page_array(const st return error; } +static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + char *src; + int ret; + + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) + return ret; + + /* + * See if we can use the atomic maps, by prefaulting in the + * pages and doing an atomic copy + */ + if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { + src = buf->ops->map(pipe, buf, 1); + ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, + sd->len); + buf->ops->unmap(pipe, buf, src); + if (!ret) { + ret = sd->len; + goto out; + } + } + + /* + * No dice, use slow non-atomic map and copy + */ + src = buf->ops->map(pipe, buf, 0); + + ret = sd->len; + if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) + ret = -EFAULT; + + buf->ops->unmap(pipe, buf, src); +out: + if (ret > 0) + sd->u.userptr += ret; + return ret; +} + +/* + * For lack of a better implementation, implement vmsplice() to userspace + * as a simple copy of the pipes pages to the user iov. + */ +static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe; + struct splice_desc sd; + ssize_t size; + int error; + long ret; + + pipe = pipe_info(file->f_path.dentry->d_inode); + if (!pipe) + return -EBADF; + + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); + + error = ret = 0; + while (nr_segs) { + void __user *base; + size_t len; + + /* + * Get user address base and length for this iovec. + */ + error = get_user(base, &iov->iov_base); + if (unlikely(error)) + break; + error = get_user(len, &iov->iov_len); + if (unlikely(error)) + break; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + if (unlikely(!len)) + break; + if (unlikely(!base)) { + error = -EFAULT; + break; + } + + if (unlikely(!access_ok(VERIFY_WRITE, base, len))) { + error = -EFAULT; + break; + } + + sd.len = 0; + sd.total_len = len; + sd.flags = flags; + sd.u.userptr = base; + sd.pos = 0; + + size = __splice_from_pipe(pipe, &sd, pipe_to_user); + if (size < 0) { + if (!ret) + ret = size; + + break; + } + + ret += size; + + if (size < len) + break; + + nr_segs--; + iov++; + } + + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + + if (!ret) + ret = error; + + return ret; +} + /* * vmsplice splices a user address range into a pipe. 
It can be thought of * as splice-from-memory, where the regular splice is splice-from-file (or diff -puN include/asm-alpha/smp.h~revert-git-block include/asm-alpha/smp.h --- a/include/asm-alpha/smp.h~revert-git-block +++ a/include/asm-alpha/smp.h @@ -47,6 +47,8 @@ extern struct cpuinfo_alpha cpu_data[NR_ extern int smp_num_cpus; #define cpu_possible_map cpu_present_map +int smp_call_function_on_cpu(void (*func) (void *info), void *info,int retry, int wait, cpumask_t cpu); + #else /* CONFIG_SMP */ #define hard_smp_processor_id() 0 diff -puN include/asm-ia64/smp.h~revert-git-block include/asm-ia64/smp.h --- a/include/asm-ia64/smp.h~revert-git-block +++ a/include/asm-ia64/smp.h @@ -38,6 +38,9 @@ ia64_get_lid (void) return lid.f.id << 8 | lid.f.eid; } +extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *), + void *info, int wait); + #define hard_smp_processor_id() ia64_get_lid() #ifdef CONFIG_SMP diff -puN include/asm-m32r/smp.h~revert-git-block include/asm-m32r/smp.h --- a/include/asm-m32r/smp.h~revert-git-block +++ a/include/asm-m32r/smp.h @@ -104,7 +104,6 @@ extern unsigned long send_IPI_mask_phys( #define LOCAL_TIMER_IPI (M32R_IRQ_IPI3-M32R_IRQ_IPI0) #define INVALIDATE_CACHE_IPI (M32R_IRQ_IPI4-M32R_IRQ_IPI0) #define CPU_BOOT_IPI (M32R_IRQ_IPI5-M32R_IRQ_IPI0) -#define CALL_FUNC_SINGLE_IPI (M32R_IRQ_IPI6-M32R_IRQ_IPI0) #define IPI_SHIFT (0) #define NR_IPIS (8) diff -puN include/asm-mips/smp.h~revert-git-block include/asm-mips/smp.h --- a/include/asm-mips/smp.h~revert-git-block +++ a/include/asm-mips/smp.h @@ -35,6 +35,16 @@ extern int __cpu_logical_map[NR_CPUS]; #define NO_PROC_ID (-1) +struct call_data_struct { + void (*func)(void *); + void *info; + atomic_t started; + atomic_t finished; + int wait; +}; + +extern struct call_data_struct *call_data; + #define SMP_RESCHEDULE_YOURSELF 0x1 /* XXX braindead */ #define SMP_CALL_FUNCTION 0x2 diff -puN include/asm-powerpc/smp.h~revert-git-block include/asm-powerpc/smp.h --- a/include/asm-powerpc/smp.h~revert-git-block +++ a/include/asm-powerpc/smp.h @@ -67,7 +67,10 @@ DECLARE_PER_CPU(cpumask_t, cpu_sibling_m * in /proc/interrupts will be wrong!!! 
--Troy */ #define PPC_MSG_CALL_FUNCTION 0 #define PPC_MSG_RESCHEDULE 1 -#define PPC_MSG_CALL_FUNC_SINGLE 2 +/* This is unused now */ +#if 0 +#define PPC_MSG_MIGRATE_TASK 2 +#endif #define PPC_MSG_DEBUGGER_BREAK 3 void smp_init_iSeries(void); diff -puN include/asm-sh/smp.h~revert-git-block include/asm-sh/smp.h --- a/include/asm-sh/smp.h~revert-git-block +++ a/include/asm-sh/smp.h @@ -26,10 +26,18 @@ extern int __cpu_logical_map[NR_CPUS]; #define NO_PROC_ID (-1) +struct smp_fn_call_struct { + spinlock_t lock; + atomic_t finished; + void (*fn)(void *); + void *data; +}; + +extern struct smp_fn_call_struct smp_fn_call; + #define SMP_MSG_FUNCTION 0 #define SMP_MSG_RESCHEDULE 1 -#define SMP_MSG_FUNCTION_SINGLE 2 -#define SMP_MSG_NR 3 +#define SMP_MSG_NR 2 void plat_smp_setup(void); void plat_prepare_cpus(unsigned int max_cpus); diff -puN include/asm-x86/hw_irq_32.h~revert-git-block include/asm-x86/hw_irq_32.h --- a/include/asm-x86/hw_irq_32.h~revert-git-block +++ a/include/asm-x86/hw_irq_32.h @@ -32,7 +32,6 @@ extern void (*const interrupt[NR_IRQS])( void reschedule_interrupt(void); void invalidate_interrupt(void); void call_function_interrupt(void); -void call_function_single_interrupt(void); #endif #ifdef CONFIG_X86_LOCAL_APIC diff -puN include/asm-x86/hw_irq_64.h~revert-git-block include/asm-x86/hw_irq_64.h --- a/include/asm-x86/hw_irq_64.h~revert-git-block +++ a/include/asm-x86/hw_irq_64.h @@ -68,7 +68,6 @@ #define ERROR_APIC_VECTOR 0xfe #define RESCHEDULE_VECTOR 0xfd #define CALL_FUNCTION_VECTOR 0xfc -#define CALL_FUNCTION_SINGLE_VECTOR 0xfb /* fb free - please don't readd KDB here because it's useless (hint - think what a NMI bit does to a vector) */ #define THERMAL_APIC_VECTOR 0xfa @@ -103,7 +102,6 @@ void spurious_interrupt(void); void error_interrupt(void); void reschedule_interrupt(void); void call_function_interrupt(void); -void call_function_single_interrupt(void); void irq_move_cleanup_interrupt(void); void invalidate_interrupt0(void); void invalidate_interrupt1(void); diff -puN include/asm-x86/mach-default/entry_arch.h~revert-git-block include/asm-x86/mach-default/entry_arch.h --- a/include/asm-x86/mach-default/entry_arch.h~revert-git-block +++ a/include/asm-x86/mach-default/entry_arch.h @@ -13,7 +13,6 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) -BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) #endif /* diff -puN include/asm-x86/mach-default/irq_vectors.h~revert-git-block include/asm-x86/mach-default/irq_vectors.h --- a/include/asm-x86/mach-default/irq_vectors.h~revert-git-block +++ a/include/asm-x86/mach-default/irq_vectors.h @@ -48,7 +48,6 @@ #define INVALIDATE_TLB_VECTOR 0xfd #define RESCHEDULE_VECTOR 0xfc #define CALL_FUNCTION_VECTOR 0xfb -#define CALL_FUNCTION_SINGLE_VECTOR 0xfa #define THERMAL_APIC_VECTOR 0xf0 /* diff -puN include/asm-x86/mach-voyager/entry_arch.h~revert-git-block include/asm-x86/mach-voyager/entry_arch.h --- a/include/asm-x86/mach-voyager/entry_arch.h~revert-git-block +++ a/include/asm-x86/mach-voyager/entry_arch.h @@ -23,4 +23,4 @@ BUILD_INTERRUPT(qic_invalidate_interrupt BUILD_INTERRUPT(qic_reschedule_interrupt, QIC_RESCHEDULE_CPI); BUILD_INTERRUPT(qic_enable_irq_interrupt, QIC_ENABLE_IRQ_CPI); BUILD_INTERRUPT(qic_call_function_interrupt, QIC_CALL_FUNCTION_CPI); -BUILD_INTERRUPT(qic_call_function_single_interrupt, QIC_CALL_FUNCTION_SINGLE_CPI); + diff -puN 
include/asm-x86/mach-voyager/irq_vectors.h~revert-git-block include/asm-x86/mach-voyager/irq_vectors.h --- a/include/asm-x86/mach-voyager/irq_vectors.h~revert-git-block +++ a/include/asm-x86/mach-voyager/irq_vectors.h @@ -33,7 +33,6 @@ #define VIC_RESCHEDULE_CPI 4 #define VIC_ENABLE_IRQ_CPI 5 #define VIC_CALL_FUNCTION_CPI 6 -#define VIC_CALL_FUNCTION_SINGLE_CPI 7 /* Now the QIC CPIs: Since we don't need the two initial levels, * these are 2 less than the VIC CPIs */ @@ -43,10 +42,9 @@ #define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET) #define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET) #define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET) -#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_SINGLE_CPI - QIC_CPI_OFFSET) #define VIC_START_FAKE_CPI VIC_TIMER_CPI -#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_SINGLE_CPI +#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_CPI /* this is the SYS_INT CPI. */ #define VIC_SYS_INT 8 diff -puN include/asm-x86/smp.h~revert-git-block include/asm-x86/smp.h --- a/include/asm-x86/smp.h~revert-git-block +++ a/include/asm-x86/smp.h @@ -59,9 +59,9 @@ struct smp_ops { void (*smp_send_stop)(void); void (*smp_send_reschedule)(int cpu); - - void (*send_call_func_ipi)(cpumask_t mask); - void (*send_call_func_single_ipi)(int cpu); + int (*smp_call_function_mask)(cpumask_t mask, + void (*func)(void *info), void *info, + int wait); }; /* Globals due to paravirt */ @@ -103,22 +103,17 @@ static inline void smp_send_reschedule(i smp_ops.smp_send_reschedule(cpu); } -static inline void arch_send_call_function_single_ipi(int cpu) -{ - smp_ops.send_call_func_single_ipi(cpu); -} - -static inline void arch_send_call_function_ipi(cpumask_t mask) +static inline int smp_call_function_mask(cpumask_t mask, + void (*func) (void *info), void *info, + int wait) { - smp_ops.send_call_func_ipi(mask); + return smp_ops.smp_call_function_mask(mask, func, info, wait); } void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); int native_cpu_up(unsigned int cpunum); -void native_send_call_func_ipi(cpumask_t mask); -void native_send_call_func_single_ipi(int cpu); extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); diff -puN include/asm-x86/xen/events.h~revert-git-block include/asm-x86/xen/events.h --- a/include/asm-x86/xen/events.h~revert-git-block +++ a/include/asm-x86/xen/events.h @@ -4,7 +4,6 @@ enum ipi_vector { XEN_RESCHEDULE_VECTOR, XEN_CALL_FUNCTION_VECTOR, - XEN_CALL_FUNCTION_SINGLE_VECTOR, XEN_NR_IPIS, }; diff -puN include/linux/bio.h~revert-git-block include/linux/bio.h --- a/include/linux/bio.h~revert-git-block +++ a/include/linux/bio.h @@ -127,7 +127,6 @@ struct bio { #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ #define BIO_EOPNOTSUPP 7 /* not supported */ -#define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* diff -puN include/linux/blkdev.h~revert-git-block include/linux/blkdev.h --- a/include/linux/blkdev.h~revert-git-block +++ a/include/linux/blkdev.h @@ -17,7 +17,6 @@ #include <linux/module.h> #include <linux/stringify.h> #include <linux/bsg.h> -#include <linux/smp.h> #include <asm/scatterlist.h> @@ -146,8 +145,7 @@ enum rq_flag_bits { */ struct request { struct list_head queuelist; - struct call_single_data csd; - int cpu; + struct list_head donelist; struct request_queue *q; @@ -300,11 +298,8 @@ struct 
request_queue unplug_fn *unplug_fn; merge_bvec_fn *merge_bvec_fn; prepare_flush_fn *prepare_flush_fn; - dma_drain_needed_fn *dma_drain_needed; - softirq_done_fn *softirq_done_fn; - cpumask_t queue_cpu; - cpumask_t complete_cpu; + dma_drain_needed_fn *dma_drain_needed; /* * Dispatch queue sorting @@ -414,7 +409,6 @@ struct request_queue #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */ #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */ #define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */ -#define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */ static inline int queue_is_locked(struct request_queue *q) { @@ -761,8 +755,6 @@ extern void blk_queue_segment_boundary(s extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn); extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); extern void blk_queue_dma_alignment(struct request_queue *, int); -extern int blk_queue_set_queue_cpu(struct request_queue *, cpumask_t); -extern int blk_queue_set_completion_cpu(struct request_queue *, cpumask_t); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); @@ -864,7 +856,7 @@ static inline void put_dev_sector(Sector } struct work_struct; -int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); +int kblockd_schedule_work(struct work_struct *work); void kblockd_flush_work(struct work_struct *work); #define MODULE_ALIAS_BLOCKDEV(major,minor) \ diff -puN include/linux/elevator.h~revert-git-block include/linux/elevator.h --- a/include/linux/elevator.h~revert-git-block +++ a/include/linux/elevator.h @@ -173,15 +173,15 @@ enum { #define rb_entry_rq(node) rb_entry((node), struct request, rb_node) /* - * Hack to reuse the csd.list list_head as the fifo time holder while + * Hack to reuse the donelist list_head as the fifo time holder while * the request is in the io scheduler. Saves an unsigned long in rq. 
*/ -#define rq_fifo_time(rq) ((unsigned long) (rq)->csd.list.next) -#define rq_set_fifo_time(rq,exp) ((rq)->csd.list.next = (void *) (exp)) +#define rq_fifo_time(rq) ((unsigned long) (rq)->donelist.next) +#define rq_set_fifo_time(rq,exp) ((rq)->donelist.next = (void *) (exp)) #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_fifo_clear(rq) do { \ list_del_init(&(rq)->queuelist); \ - INIT_LIST_HEAD(&(rq)->csd.list); \ + INIT_LIST_HEAD(&(rq)->donelist); \ } while (0) /* diff -puN include/linux/iocontext.h~revert-git-block include/linux/iocontext.h --- a/include/linux/iocontext.h~revert-git-block +++ a/include/linux/iocontext.h @@ -30,11 +30,12 @@ struct as_io_context { sector_t seek_mean; }; +struct cfq_queue; struct cfq_io_context { void *key; unsigned long dead_key; - void *cfqq[2]; + struct cfq_queue *cfqq[2]; struct io_context *ioc; @@ -81,7 +82,6 @@ struct io_context { struct as_io_context *aic; struct radix_tree_root radix_root; struct hlist_head cic_list; - struct hlist_head bfq_cic_list; void *ioc_data; }; diff -puN include/linux/smp.h~revert-git-block include/linux/smp.h --- a/include/linux/smp.h~revert-git-block +++ a/include/linux/smp.h @@ -7,19 +7,9 @@ */ #include <linux/errno.h> -#include <linux/list.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> extern void cpu_idle(void); -struct call_single_data { - struct list_head list; - void (*func) (void *info); - void *info; - unsigned int flags; -}; - #ifdef CONFIG_SMP #include <linux/preempt.h> @@ -63,23 +53,9 @@ extern void smp_cpus_done(unsigned int m * Call a function on all other processors */ int smp_call_function(void(*func)(void *info), void *info, int retry, int wait); -int smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, - int wait); + int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, int retry, int wait); -void __smp_call_function_single(int cpuid, struct call_single_data *data); - -/* - * Generic and arch helpers - */ -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS -void generic_smp_call_function_single_interrupt(void); -void generic_smp_call_function_interrupt(void); -void init_call_single_data(void); -void arch_send_call_function_single_ipi(int cpu); -void arch_send_call_function_ipi(cpumask_t mask); -extern spinlock_t call_function_lock; -#endif /* * Call a function on all processors @@ -136,9 +112,7 @@ static inline void smp_send_reschedule(i }) #define smp_call_function_mask(mask, func, info, wait) \ (up_smp_call_function(func, info)) -static inline void init_call_single_data(void) -{ -} + #endif /* !SMP */ /* diff -puN include/linux/workqueue.h~revert-git-block include/linux/workqueue.h --- a/include/linux/workqueue.h~revert-git-block +++ a/include/linux/workqueue.h @@ -181,8 +181,6 @@ extern void destroy_workqueue(struct wor extern int queue_work(struct workqueue_struct *wq, struct work_struct *work); extern int queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); -extern int queue_work_on_cpu(struct workqueue_struct *wq, - struct work_struct *work, int cpu); extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); diff -puN init/main.c~revert-git-block init/main.c --- a/init/main.c~revert-git-block +++ a/init/main.c @@ -31,7 +31,6 @@ #include <linux/kernel_stat.h> #include <linux/start_kernel.h> #include <linux/security.h> -#include <linux/smp.h> #include <linux/workqueue.h> #include <linux/profile.h> #include 
<linux/rcupdate.h> @@ -778,7 +777,6 @@ static void __init do_pre_smp_initcalls( { extern int spawn_ksoftirqd(void); - init_call_single_data(); migration_init(); spawn_ksoftirqd(); if (!nosoftlockup) diff -puN kernel/Makefile~revert-git-block kernel/Makefile --- a/kernel/Makefile~revert-git-block +++ a/kernel/Makefile @@ -35,7 +35,6 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o -obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o diff -puN kernel/smp.c~revert-git-block /dev/null --- a/kernel/smp.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Generic helpers for smp ipi calls - * - * (C) Jens Axboe <jens.axboe@xxxxxxxxxx> 2008 - * - */ -#include <linux/init.h> -#include <linux/module.h> -#include <linux/percpu.h> -#include <linux/rcupdate.h> -#include <linux/smp.h> - -static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); -static LIST_HEAD(call_function_queue); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); - -enum { - CSD_FLAG_WAIT = 0x01, - CSD_FLAG_ALLOC = 0x02, -}; - -struct call_function_data { - struct call_single_data csd; - spinlock_t lock; - unsigned int refs; - cpumask_t cpumask; - struct rcu_head rcu_head; -}; - -struct call_single_queue { - struct list_head list; - spinlock_t lock; -}; - -void __cpuinit init_call_single_data(void) -{ - int i; - - for_each_possible_cpu(i) { - struct call_single_queue *q = &per_cpu(call_single_queue, i); - - spin_lock_init(&q->lock); - INIT_LIST_HEAD(&q->list); - } -} - -static void csd_flag_wait(struct call_single_data *data) -{ - /* Wait for response */ - do { - /* - * We need to see the flags store in the IPI handler - */ - smp_mb(); - if (!(data->flags & CSD_FLAG_WAIT)) - break; - cpu_relax(); - } while (1); -} - -/* - * Insert a previously allocated call_single_data element for execution - * on the given CPU. data must already have ->func, ->info, and ->flags set. - */ -static void generic_exec_single(int cpu, struct call_single_data *data) -{ - struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); - int wait = data->flags & CSD_FLAG_WAIT, ipi; - unsigned long flags; - - spin_lock_irqsave(&dst->lock, flags); - ipi = list_empty(&dst->list); - list_add_tail(&data->list, &dst->list); - spin_unlock_irqrestore(&dst->lock, flags); - - if (ipi) - arch_send_call_function_single_ipi(cpu); - - if (wait) - csd_flag_wait(data); -} - -static void rcu_free_call_data(struct rcu_head *head) -{ - struct call_function_data *data; - - data = container_of(head, struct call_function_data, rcu_head); - - kfree(data); -} - -/* - * Invoked by arch to handle an IPI for call function. Must be called with - * interrupts disabled. 
- */ -void generic_smp_call_function_interrupt(void) -{ - struct call_function_data *data; - int cpu = get_cpu(); - - /* - * It's ok to use list_for_each_rcu() here even though we may delete - * 'pos', since list_del_rcu() doesn't clear ->next - */ - rcu_read_lock(); - list_for_each_entry_rcu(data, &call_function_queue, csd.list) { - int refs; - - if (!cpu_isset(cpu, data->cpumask)) - continue; - - data->csd.func(data->csd.info); - - spin_lock(&data->lock); - cpu_clear(cpu, data->cpumask); - WARN_ON(data->refs == 0); - data->refs--; - refs = data->refs; - spin_unlock(&data->lock); - - if (refs) - continue; - - spin_lock(&call_function_lock); - list_del_rcu(&data->csd.list); - spin_unlock(&call_function_lock); - - if (data->csd.flags & CSD_FLAG_WAIT) { - /* - * serialize stores to data with the flag clear - * and wakeup - */ - smp_wmb(); - data->csd.flags &= ~CSD_FLAG_WAIT; - } else - call_rcu(&data->rcu_head, rcu_free_call_data); - } - rcu_read_unlock(); - - put_cpu(); -} - -/* - * Invoked by arch to handle an IPI for call function single. Must be called - * from the arch with interrupts disabled. - */ -void generic_smp_call_function_single_interrupt(void) -{ - struct call_single_queue *q = &__get_cpu_var(call_single_queue); - LIST_HEAD(list); - - /* - * Need to see other stores to list head for checking whether - * list is empty without holding q->lock - */ - smp_mb(); - while (!list_empty(&q->list)) { - unsigned int data_flags; - - spin_lock(&q->lock); - list_replace_init(&q->list, &list); - spin_unlock(&q->lock); - - while (!list_empty(&list)) { - struct call_single_data *data; - - data = list_entry(list.next, struct call_single_data, - list); - list_del(&data->list); - - /* - * 'data' can be invalid after this call if - * flags == 0 (when called through - * generic_exec_single(), so save them away before - * making the call. - */ - data_flags = data->flags; - - data->func(data->info); - - if (data_flags & CSD_FLAG_WAIT) { - smp_wmb(); - data->flags &= ~CSD_FLAG_WAIT; - } else if (data_flags & CSD_FLAG_ALLOC) - kfree(data); - } - /* - * See comment on outer loop - */ - smp_mb(); - } -} - -/* - * smp_call_function_single - Run a function on a specific CPU - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @retry: Unused - * @wait: If true, wait until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - */ -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int retry, int wait) -{ - struct call_single_data d; - unsigned long flags; - /* prevent preemption and reschedule on another processor */ - int me = get_cpu(); - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - if (cpu == me) { - local_irq_save(flags); - func(info); - local_irq_restore(flags); - } else { - struct call_single_data *data = NULL; - - if (!wait) { - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) - data->flags = CSD_FLAG_ALLOC; - } - if (!data) { - data = &d; - data->flags = CSD_FLAG_WAIT; - } - - data->func = func; - data->info = info; - generic_exec_single(cpu, data); - } - - put_cpu(); - return 0; -} -EXPORT_SYMBOL(smp_call_function_single); - -/** - * __smp_call_function_single(): Run a function on another CPU - * @cpu: The CPU to run on. - * @data: Pre-allocated and setup data structure - * - * Like smp_call_function_single(), but allow caller to pass in a pre-allocated - * data structure. 
Useful for embedding @data inside other structures, for - * instance. - * - */ -void __smp_call_function_single(int cpu, struct call_single_data *data) -{ - /* Can deadlock when called with interrupts disabled */ - WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); - - generic_exec_single(cpu, data); -} - -/** - * smp_call_function_mask(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, - int wait) -{ - struct call_function_data d; - struct call_function_data *data = NULL; - cpumask_t allbutself; - unsigned long flags; - int cpu, num_cpus; - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - cpu = smp_processor_id(); - allbutself = cpu_online_map; - cpu_clear(cpu, allbutself); - cpus_and(mask, mask, allbutself); - num_cpus = cpus_weight(mask); - - /* - * If zero CPUs, return. If just a single CPU, turn this request - * into a targetted single call instead since it's faster. - */ - if (!num_cpus) - return 0; - else if (num_cpus == 1) { - cpu = first_cpu(mask); - return smp_call_function_single(cpu, func, info, 0, wait); - } - - if (!wait) { - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) - data->csd.flags = CSD_FLAG_ALLOC; - } - if (!data) { - data = &d; - data->csd.flags = CSD_FLAG_WAIT; - } - - spin_lock_init(&data->lock); - data->csd.func = func; - data->csd.info = info; - data->refs = num_cpus; - - /* - * need to see above stores before the cpumask is valid for the CPU - */ - smp_wmb(); - data->cpumask = mask; - - spin_lock_irqsave(&call_function_lock, flags); - list_add_tail_rcu(&data->csd.list, &call_function_queue); - spin_unlock_irqrestore(&call_function_lock, flags); - - /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi(mask); - - /* optionally wait for the CPUs to complete */ - if (wait) - csd_flag_wait(&data->csd); - - return 0; -} -EXPORT_SYMBOL(smp_call_function_mask); - -/** - * smp_call_function(): Run a function on all other CPUs. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @natomic: Unused - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. 
- */ -int smp_call_function(void (*func)(void *), void *info, int natomic, int wait) -{ - int ret; - - preempt_disable(); - ret = smp_call_function_mask(cpu_online_map, func, info, wait); - preempt_enable(); - return ret; -} -EXPORT_SYMBOL(smp_call_function); diff -puN kernel/workqueue.c~revert-git-block kernel/workqueue.c --- a/kernel/workqueue.c~revert-git-block +++ a/kernel/workqueue.c @@ -155,41 +155,22 @@ static void __queue_work(struct cpu_work * queue_work - queue work on a workqueue * @wq: workqueue to use * @work: work to queue - * @cpu: cpu to queue the work on * * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. */ -int queue_work_on_cpu(struct workqueue_struct *wq, struct work_struct *work, - int cpu) +int queue_work(struct workqueue_struct *wq, struct work_struct *work) { int ret = 0; if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, cpu), work); + __queue_work(wq_per_cpu(wq, get_cpu()), work); + put_cpu(); ret = 1; } - - return ret; -} -EXPORT_SYMBOL_GPL(queue_work_on_cpu); - -/** - * queue_work - queue work on a workqueue - * @wq: workqueue to use - * @work: work to queue - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - * - * We queue the work to the CPU it was submitted, but there is no - * guarantee that it will be processed by that CPU. - */ -int queue_work(struct workqueue_struct *wq, struct work_struct *work) -{ - int ret; - - ret = queue_work_on_cpu(wq, work, get_cpu()); - put_cpu(); return ret; } EXPORT_SYMBOL_GPL(queue_work); _ Patches currently in -mm which might be from akpm@xxxxxxxxxxxxxxxxxxxx are quota-dont-call-sync_fs-from-vfs_quota_off-when-theres-no-quota-turn-off.patch fix-hfsplus-oops-on-image-without-extents.patch rtc-rtc_time_to_tm-use-unsigned-arithmetic.patch atmel_lcdfb-fix-pixclok-divider-calculation.patch memcg-fix-possible-panic-when-config_mm_owner=y.patch drivers-char-synclink_gtc-dont-return-an-uninitialised-local.patch linux-next.patch next-remove-localversion.patch linux-next-git-rejects.patch revert-9p-convert-from-semaphore-to-spinlock.patch ia64-kvm-dont-delete-files-which-we-need.patch revert-lxfb-extend-pll-table-to-support-dotclocks-below-25-mhz.patch revert-acpica-fixes-for-unload-and-ddbhandles.patch acpi-enable-c3-power-state-on-dell-inspiron-8200.patch acpi-video-balcklist-fujitsu-lifebook-s6410.patch git-x86-fixup.patch arch-x86-mm-patc-use-boot_cpu_has.patch x86-setup_force_cpu_cap-dont-do-clear_bitnon-unsigned-long.patch lguest-use-cpu-capability-accessors.patch x86-set_restore_sigmask-avoid-bitop-on-a-u32.patch x86-early_init_centaur-use-set_cpu_cap.patch x86-bitops-take-an-unsigned-long.patch arm-omap1-n770-convert-audio_pwr_sem-in-a-mutex-fix.patch audit_send_reply-fix-error-path-memory-leak.patch cifs-suppress-warning.patch sysfs-provide-a-clue-about-the-effects-of-config_usb_device_class=y.patch fix-gregkh-driver-core-read-dev_name-instead-of-bus_id.patch fix-sparc64-gregkh-driver-core-read-dev_name-instead-of-bus_id.patch zoran-use-correct-type-for-cpu-flags.patch i2c-renesas-highlander-fpga-smbus-support.patch ibmaem-new-driver-for-power-energy-temp-meters-in-ibm-system-x-hardware-ia64-warnings.patch dlm-convert-connections_lock-in-a-mutex-fix.patch drivers-infiniband-hw-mlx4-qpc-fix-uninitialised-var-warning.patch git-input.patch git-jg-misc-git-rejects.patch 
drivers-scsi-broadsasc-fix-uninitialised-var-warning.patch git-mmc.patch mmc-sd-host-driver-for-ricoh-bay1controllers-fix.patch mmc-sd-host-driver-for-ricoh-bay1controllers-fix-2.patch git-ubifs.patch hysdn-no-longer-broken-on-smp.patch sundance-set-carrier-status-on-link-change-events.patch dm9000-use-delayed-work-to-update-mii-phy-state-fix.patch pcnet32-fix-warning.patch drivers-net-tokenring-3c359c-squish-a-warning.patch drivers-net-tokenring-olympicc-fix-warning.patch update-smc91x-driver-with-arm-versatile-board-info.patch git-battery.patch fs-nfs-callback_xdrc-suppress-uninitialiized-variable-warnings.patch arch-parisc-kernel-unalignedc-use-time_-macros.patch pci-add-pci_match_id-stub-for-config_pci=n.patch pci-hotplug-introduce-pci_slot.patch pci-hotplug-acpi-pci-slot-detection-driver.patch drivers-scsi-qla2xxx-qla_osc-suppress-uninitialized-var-warning.patch revert-git-block.patch git-block-ia64-build-fix.patch git-block-fix-s390-build.patch s390-uninline-spinlock-functions-which-use-smp_processor_id.patch git-unionfs.patch git-unionfs-fixup.patch unionfs-broke.patch git-logfs-fixup.patch drivers-uwb-nehc-processor-flags-have-type-unsigned-long.patch drivers-usb-host-isp1760-hcdc-procesxor-flags-have-type-unsigned-long.patch uwb-fix-scscanf-warning.patch drivers-uwb-wlp-sysfsc-dead-code.patch drivers-uwb-i1480-dfu-macc-fix-min-warning.patch drivers-uwb-i1480-dfu-usbc-fix-size_t-confusion.patch drivers-uwb-whcic-needs-dma-mappingh.patch git-v9fs.patch revert-git-v9fs.patch git-watchdog.patch git-watchdog-git-rejects.patch watchdog-fix-booke_wdtc-on-mpc85xx-smp-system.patch xfs-suppress-uninitialized-var-warnings.patch git-xtensa.patch git-orion-git-rejects.patch ext4-is-busted-on-m68k.patch common-implementation-of-iterative-div-mod-fix.patch common-implementation-of-iterative-div-mod-checkpatch-fixes.patch common-implementation-of-iterative-div-mod-fix-2.patch scsi-dpt_i2o-is-bust-on-ia64.patch colibri-fix-support-for-dm9000-ethernet-device-fix.patch mm-verify-the-page-links-and-memory-model-fix.patch mm-verify-the-page-links-and-memory-model-fix-fix.patch mspec-convert-nopfn-to-fault-fix.patch page-allocator-inlnie-some-__alloc_pages-wrappers-fix.patch kill-generic_file_direct_io-checkpatch-fixes.patch vmscan-give-referenced-active-and-unmapped-pages-a-second-trip-around-the-lru.patch vm-dont-run-touch_buffer-during-buffercache-lookups.patch split-the-typecheck-macros-out-of-include-linux-kernelh.patch locking-add-typecheck-on-irqsave-and-friends-for-correct-flags.patch locking-add-typecheck-on-irqsave-and-friends-for-correct-flags-fix.patch remove-apparently-unused-fd1772h-header-file.patch lib-allow-memparse-to-accept-a-null-and-ignorable-second-parm-checkpatch-fixes.patch rename-warn-to-warning-to-clear-the-namespace-fix.patch add-a-warn-macro-this-is-warn_on-printk-arguments-fix.patch flag-parameters-paccept-fix.patch flag-parameters-anon_inode_getfd-extension-fix.patch flag-parameters-inotify_init-fix.patch flag-parameters-check-magic-constants-alpha-hack.patch drivers-video-aty-radeon_basec-notify-user-if-sysfs_create_bin_file-failed-checkpatch-fixes.patch reiserfs-convert-j_commit_lock-to-mutex-checkpatch-fixes.patch documentation-build-source-files-in-documentation-sub-dir-disable.patch reiser4.patch reiser4-semaphore-fix.patch page-owner-tracking-leak-detector.patch nr_blockdev_pages-in_interrupt-warning.patch slab-leaks3-default-y.patch put_bh-debug.patch shrink_slab-handle-bad-shrinkers.patch getblk-handle-2tb-devices.patch getblk-handle-2tb-devices-fix.patch 
undeprecate-pci_find_device.patch notify_change-callers-must-hold-i_mutex.patch profile-likely-unlikely-macros.patch drivers-net-bonding-bond_sysfsc-suppress-uninitialized-var-warning.patch w1-build-fix.patch -- To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html
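For readers skimming the revert, the include/linux/smp.h hunk above restores the older five-argument prototype, int smp_call_function_single(int cpuid, void (*func)(void *info), void *info, int retry, int wait), in place of the generic-helper variants being removed. A minimal caller of that restored form might look like the sketch below; this is illustrative only, not code from the patch, and the module name, target CPU and message string are made-up placeholders.

/* Illustrative sketch only -- not taken from the patch above. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/cpumask.h>

static void say_hello(void *info)
{
	/* Runs on the target CPU in IPI context: keep it fast, never sleep. */
	printk(KERN_INFO "hello from CPU %d: %s\n",
	       smp_processor_id(), (char *)info);
}

static int __init ipi_demo_init(void)
{
	int target = 1;			/* hypothetical target CPU */
	int this_cpu = get_cpu();	/* stay put while we check the target */

	if (cpu_online(target) && target != this_cpu)
		/* retry = 0, wait = 1: block until say_hello() has completed */
		smp_call_function_single(target, say_hello, "ipi demo", 0, 1);
	put_cpu();
	return 0;
}

static void __exit ipi_demo_exit(void)
{
}

module_init(ipi_demo_init);
module_exit(ipi_demo_exit);
MODULE_LICENSE("GPL");

The wait = 1 case is the simpler one: with wait = 0, a caller that cannot block must keep the info pointer valid until the function has actually run on the remote CPU, whichever implementation (per-arch or the generic helpers being reverted here) is backing the call.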