The only atomic operation on parisc is the ldcw instruction, which loads
a 32-bit word from memory and atomically replaces it with zero (load and
clear word). This instruction is used to implement the kernel's internal
spinlocks.

Up to now we tried to optimize ldcw usage by using the coherent-operation
(",co") completer of this instruction, which operates on the cache instead
of main memory and thus may speed things up; it was enabled by default in
our 64-bit kernel builds. But we still see runtime locking problems, so
this patch changes both 32- and 64-bit kernels back to plain ldcw and
live-patches it at runtime to use the coherent completer when running on
a uniprocessor machine.

Signed-off-by: Helge Deller <deller@xxxxxx>

diff --git a/arch/parisc/include/asm/alternative.h b/arch/parisc/include/asm/alternative.h
index 0ec54f43d6d2..2667ec07acb9 100644
--- a/arch/parisc/include/asm/alternative.h
+++ b/arch/parisc/include/asm/alternative.h
@@ -11,6 +11,7 @@
 #define ALT_COND_RUN_ON_QEMU	0x20	/* if running on QEMU */
 
 #define INSN_PxTLB	0x02		/* modify pdtlb, pitlb */
+#define INSN_LDCW_CO	0x03		/* change cc in ldcw to ldcw,co */
 #define INSN_NOP	0x08000240	/* nop */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/parisc/include/asm/assembly.h b/arch/parisc/include/asm/assembly.h
index a39250cb7dfc..8d6e76279d80 100644
--- a/arch/parisc/include/asm/assembly.h
+++ b/arch/parisc/include/asm/assembly.h
@@ -44,8 +44,9 @@
 #define CALLEE_SAVE_FRAME_SIZE	(CALLEE_REG_FRAME_SIZE + CALLEE_FLOAT_FRAME_SIZE)
 
+#define LDCW	ALTERNATIVE(., .+4, ALT_COND_NO_SMP, INSN_LDCW_CO)	! ldcw
+
 #ifdef CONFIG_PA20
-#define LDCW	ldcw,co
 #define BL	b,l
 # ifdef CONFIG_64BIT
 #  define PA_ASM_LEVEL	2.0w
@@ -53,7 +54,6 @@
 #  define PA_ASM_LEVEL	2.0
 # endif
 #else
-#define LDCW	ldcw
 #define BL	bl
 #define PA_ASM_LEVEL	1.1
 #endif
diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h
index e080143e79a3..59130b0dbc3e 100644
--- a/arch/parisc/include/asm/ldcw.h
+++ b/arch/parisc/include/asm/ldcw.h
@@ -2,7 +2,8 @@
 #ifndef __PARISC_LDCW_H
 #define __PARISC_LDCW_H
 
-#ifndef CONFIG_PA20
+#include <asm/alternative.h>
+
 /* Because kmalloc only guarantees 8-byte alignment for kmalloc'd
    data, and GCC only guarantees 8-byte alignment for stack locals,
    we can't be assured of 16-byte alignment for atomic lock data even if we
@@ -19,22 +20,6 @@
 		& ~(__PA_LDCW_ALIGNMENT - 1);	\
 	(volatile unsigned int *) __ret;	\
 })
-#define __LDCW	"ldcw"
-
-#else /*CONFIG_PA20*/
-/* From: "Jim Hull" <jim.hull of hp.com>
-   I've attached a summary of the change, but basically, for PA 2.0, as
-   long as the ",CO" (coherent operation) completer is specified, then the
-   16-byte alignment requirement for ldcw and ldcd is relaxed, and instead
-   they only require "natural" alignment (4-byte for ldcw, 8-byte for
-   ldcd). */
-
-#define __PA_LDCW_ALIGNMENT	4
-#define __PA_LDCW_ALIGN_ORDER	2
-#define __ldcw_align(a) (&(a)->slock)
-#define __LDCW	"ldcw,co"
-
-#endif /*!CONFIG_PA20*/
 
 /* LDCW, the only atomic read-write operation PA-RISC has. *sigh*.
    We don't explicitly expose that "*a" may be written as reload
@@ -46,7 +31,8 @@
    usually used within code blocks surrounded by memory barriers. */
 #define __ldcw(a) ({					\
 	unsigned __ret;					\
-	__asm__ __volatile__(__LDCW " 0(%1),%0"		\
+	__asm__ __volatile__("ldcw 0(%1),%0"		\
+		ALTERNATIVE(ALT_COND_NO_SMP, INSN_LDCW_CO) \
 		: "=r" (__ret) : "r" (a) : "memory");	\
 	__ret;						\
 })
diff --git a/arch/parisc/include/asm/spinlock_types.h b/arch/parisc/include/asm/spinlock_types.h
index 42979c5704dc..82d2384c3f22 100644
--- a/arch/parisc/include/asm/spinlock_types.h
+++ b/arch/parisc/include/asm/spinlock_types.h
@@ -3,13 +3,8 @@
 #define __ASM_SPINLOCK_TYPES_H
 
 typedef struct {
-#ifdef CONFIG_PA20
-	volatile unsigned int slock;
-# define __ARCH_SPIN_LOCK_UNLOCKED { 1 }
-#else
 	volatile unsigned int lock[4];
 # define __ARCH_SPIN_LOCK_UNLOCKED	{ { 1, 1, 1, 1 } }
-#endif
 } arch_spinlock_t;
 
 typedef struct {
diff --git a/arch/parisc/kernel/alternative.c b/arch/parisc/kernel/alternative.c
index 3c66d5c4d90d..cf83a801cc2a 100644
--- a/arch/parisc/kernel/alternative.c
+++ b/arch/parisc/kernel/alternative.c
@@ -69,6 +69,12 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 			if (boot_cpu_data.cpu_type >= pcxu) /* >= pa2.0 ? */
 				replacement |= (1 << 10); /* set el bit */
 		}
+		/* Want to replace ldcw by a ldcw,co instruction? */
+		if (replacement == INSN_LDCW_CO) {
+			replacement = *from;
+			/* set cache-coherent completer bits: */
+			replacement |= (0x01 << 10);
+		}
 
 		/*
 		 * Replace instruction with NOPs?
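
For readers following along: the lock word convention behind all of this is
that 1 means unlocked, and ldcw's load-and-zero means only one CPU can read
back a non-zero value. Roughly, the spinlock fast paths built on the
__ldcw()/__ldcw_align() helpers touched above look like the sketch below
(simplified; the function names and the omission of IRQ handling are mine,
not part of this patch):

/* Simplified sketch, not part of this patch: how a parisc spinlock can be
 * built on __ldcw()/__ldcw_align(). The word starts at 1 (unlocked); ldcw
 * atomically fetches it and writes 0, so only one CPU reads back non-zero.
 */
static inline void example_spin_lock(arch_spinlock_t *x)
{
	volatile unsigned int *a = __ldcw_align(x);	/* 16-byte aligned lock word */

	while (__ldcw(a) == 0)		/* try to grab it: non-zero means we won */
		while (*a == 0)		/* spin with plain loads while it is held */
			cpu_relax();
}

static inline void example_spin_unlock(arch_spinlock_t *x)
{
	volatile unsigned int *a = __ldcw_align(x);

	mb();				/* order critical-section accesses before release */
	*a = 1;				/* a plain store releases the lock */
}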
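
The alternative.c hunk is the live-patching half: when an alternative's
condition holds (here ALT_COND_NO_SMP, i.e. a uniprocessor machine),
apply_alternatives() rewrites each annotated ldcw in place by OR-ing the
coherent-operation completer bit into the original encoding. As a plain-C
illustration of just that bit manipulation (the helper name is invented
for this sketch):

#include <stdint.h>

/* Illustration only: mirrors the INSN_LDCW_CO case above, which takes the
 * original 32-bit "ldcw" encoding (*from) and sets the cache-control
 * completer bit so the instruction decodes as "ldcw,co". The function
 * name is hypothetical. */
static uint32_t ldcw_to_ldcw_co(uint32_t ldcw_insn)
{
	return ldcw_insn | (0x01 << 10);	/* same fixup as apply_alternatives() */
}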