STFill Buffer locate between core and L1 cache, it causes memory access out of order, so writel/outl need a barrier. Loongson 3 has a bug that di cannot save irqflag, so we need a mfc0. On Tue, Jan 26, 2016 at 10:19 PM, James Hogan <james.hogan@xxxxxxxxxx> wrote: > On Tue, Jan 26, 2016 at 09:26:24PM +0800, Huacai Chen wrote: >> New Loongson 3 CPU (since Loongson-3A R2, as opposed to Loongson-3A R1, >> Loongson-3B R1 and Loongson-3B R2) has many enhancements, such as FTLB, >> L1-VCache, EI/DI/Wait/Prefetch instruction, DSP/DSPv2 ASE, User Local >> register, Read-Inhibit/Execute-Inhibit, SFB (Store Fill Buffer), Fast >> TLB refill support, etc. >> >> This patch introduce a config option, CONFIG_LOONGSON3_ENHANCEMENT, to >> enable those enhancements which cannot be probed at run time. If you >> want a generic kernel to run on all Loongson 3 machines, please say 'N' >> here. If you want a high-performance kernel to run on new Loongson 3 >> machines only, please say 'Y' here. >> >> Signed-off-by: Huacai Chen <chenhc@xxxxxxxxxx> >> --- >> arch/mips/Kconfig | 18 ++++++++++++++++++ >> arch/mips/include/asm/hazards.h | 7 ++++--- >> arch/mips/include/asm/io.h | 10 +++++----- >> arch/mips/include/asm/irqflags.h | 5 +++++ >> .../include/asm/mach-loongson64/kernel-entry-init.h | 12 ++++++++++++ >> arch/mips/mm/c-r4k.c | 3 +++ >> arch/mips/mm/page.c | 9 +++++++++ >> 7 files changed, 56 insertions(+), 8 deletions(-) >> >> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig >> index 15faaf0..e6d6f7b 100644 >> --- a/arch/mips/Kconfig >> +++ b/arch/mips/Kconfig >> @@ -1349,6 +1349,24 @@ config CPU_LOONGSON3 >> The Loongson 3 processor implements the MIPS64R2 instruction >> set with many extensions. >> >> +config LOONGSON3_ENHANCEMENT >> + bool "New Loongson 3 CPU Enhancements" >> + default n > > no need, n is the default. > >> + select CPU_MIPSR2 >> + select CPU_HAS_PREFETCH >> + depends on CPU_LOONGSON3 >> + help >> + New Loongson 3 CPU (since Loongson-3A R2, as opposed to Loongson-3A >> + R1, Loongson-3B R1 and Loongson-3B R2) has many enhancements, such as >> + FTLB, L1-VCache, EI/DI/Wait/Prefetch instruction, DSP/DSPv2 ASE, User >> + Local register, Read-Inhibit/Execute-Inhibit, SFB (Store Fill Buffer), >> + Fast TLB refill support, etc. >> + >> + This option enable those enhancements which cannot be probed at run >> + time. If you want a generic kernel to run on all Loongson 3 machines, >> + please say 'N' here. If you want a high-performance kernel to run on >> + new Loongson 3 machines only, please say 'Y' here. >> + >> config CPU_LOONGSON2E >> bool "Loongson 2E" >> depends on SYS_HAS_CPU_LOONGSON2E >> diff --git a/arch/mips/include/asm/hazards.h b/arch/mips/include/asm/hazards.h >> index 7b99efd..dbb1eb6 100644 >> --- a/arch/mips/include/asm/hazards.h >> +++ b/arch/mips/include/asm/hazards.h >> @@ -22,7 +22,8 @@ >> /* >> * TLB hazards >> */ >> -#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6) && !defined(CONFIG_CPU_CAVIUM_OCTEON) >> +#if (defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)) && \ >> + !defined(CONFIG_CPU_CAVIUM_OCTEON) && !defined(CONFIG_LOONGSON3_ENHANCEMENT) >> >> /* >> * MIPSR2 defines ehb for hazard avoidance >> @@ -155,8 +156,8 @@ do { \ >> } while (0) >> >> #elif defined(CONFIG_MIPS_ALCHEMY) || defined(CONFIG_CPU_CAVIUM_OCTEON) || \ >> - defined(CONFIG_CPU_LOONGSON2) || defined(CONFIG_CPU_R10000) || \ >> - defined(CONFIG_CPU_R5500) || defined(CONFIG_CPU_XLR) >> + defined(CONFIG_CPU_LOONGSON2) || defined(CONFIG_LOONGSON3_ENHANCEMENT) || \ >> + defined(CONFIG_CPU_R10000) || defined(CONFIG_CPU_R5500) || defined(CONFIG_CPU_XLR) >> >> /* >> * R10000 rocks - all hazards handled in hardware, so this becomes a nobrainer. >> diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h >> index 2b4dc7a..ecabc00 100644 >> --- a/arch/mips/include/asm/io.h >> +++ b/arch/mips/include/asm/io.h >> @@ -304,10 +304,10 @@ static inline void iounmap(const volatile void __iomem *addr) >> #undef __IS_KSEG1 >> } >> >> -#ifdef CONFIG_CPU_CAVIUM_OCTEON >> -#define war_octeon_io_reorder_wmb() wmb() >> +#if defined(CONFIG_CPU_CAVIUM_OCTEON) || defined(CONFIG_LOONGSON3_ENHANCEMENT) >> +#define war_io_reorder_wmb() wmb() >> #else >> -#define war_octeon_io_reorder_wmb() do { } while (0) >> +#define war_io_reorder_wmb() do { } while (0) >> #endif > > Doesn't this slow things down when enabled, or is it required due to > STFill buffer being enabled or something? > >> >> #define __BUILD_MEMORY_SINGLE(pfx, bwlq, type, irq) \ >> @@ -318,7 +318,7 @@ static inline void pfx##write##bwlq(type val, \ >> volatile type *__mem; \ >> type __val; \ >> \ >> - war_octeon_io_reorder_wmb(); \ >> + war_io_reorder_wmb(); \ >> \ >> __mem = (void *)__swizzle_addr_##bwlq((unsigned long)(mem)); \ >> \ >> @@ -387,7 +387,7 @@ static inline void pfx##out##bwlq##p(type val, unsigned long port) \ >> volatile type *__addr; \ >> type __val; \ >> \ >> - war_octeon_io_reorder_wmb(); \ >> + war_io_reorder_wmb(); \ >> \ >> __addr = (void *)__swizzle_addr_##bwlq(mips_io_port_base + port); \ >> \ >> diff --git a/arch/mips/include/asm/irqflags.h b/arch/mips/include/asm/irqflags.h >> index 65c351e..12f80b5 100644 >> --- a/arch/mips/include/asm/irqflags.h >> +++ b/arch/mips/include/asm/irqflags.h >> @@ -41,7 +41,12 @@ static inline unsigned long arch_local_irq_save(void) >> " .set push \n" >> " .set reorder \n" >> " .set noat \n" >> +#if defined(CONFIG_LOONGSON3_ENHANCEMENT) >> + " mfc0 %[flags], $12 \n" >> + " di \n" > > Does this somehow help performance, or is it necessary when STFill > buffer is enabled? > >> +#else >> " di %[flags] \n" >> +#endif >> " andi %[flags], 1 \n" >> " " __stringify(__irq_disable_hazard) " \n" >> " .set pop \n" >> diff --git a/arch/mips/include/asm/mach-loongson64/kernel-entry-init.h b/arch/mips/include/asm/mach-loongson64/kernel-entry-init.h >> index da83482..8393bc54 100644 >> --- a/arch/mips/include/asm/mach-loongson64/kernel-entry-init.h >> +++ b/arch/mips/include/asm/mach-loongson64/kernel-entry-init.h >> @@ -26,6 +26,12 @@ >> mfc0 t0, $5, 1 >> or t0, (0x1 << 29) >> mtc0 t0, $5, 1 >> +#ifdef CONFIG_LOONGSON3_ENHANCEMENT >> + /* Enable STFill Buffer */ >> + mfc0 t0, $16, 6 >> + or t0, 0x100 >> + mtc0 t0, $16, 6 >> +#endif >> _ehb >> .set pop >> #endif >> @@ -46,6 +52,12 @@ >> mfc0 t0, $5, 1 >> or t0, (0x1 << 29) >> mtc0 t0, $5, 1 >> +#ifdef CONFIG_LOONGSON3_ENHANCEMENT >> + /* Enable STFill Buffer */ >> + mfc0 t0, $16, 6 >> + or t0, 0x100 >> + mtc0 t0, $16, 6 >> +#endif > > What does the STFill buffer do? > > Given that you can get a portable kernel without this, can this not be > done from C code depending on the PRid? > >> _ehb >> .set pop >> #endif >> diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c >> index 65fb28c..903d8da 100644 >> --- a/arch/mips/mm/c-r4k.c >> +++ b/arch/mips/mm/c-r4k.c >> @@ -1170,6 +1170,9 @@ static void probe_pcache(void) >> c->dcache.ways * >> c->dcache.linesz; >> c->dcache.waybit = 0; >> +#ifdef CONFIG_CPU_HAS_PREFETCH >> + c->options |= MIPS_CPU_PREFETCH; >> +#endif > > Can't do that based on PRid? > > Cheers > James > >> break; >> >> case CPU_CAVIUM_OCTEON3: >> diff --git a/arch/mips/mm/page.c b/arch/mips/mm/page.c >> index 885d73f..c41953c 100644 >> --- a/arch/mips/mm/page.c >> +++ b/arch/mips/mm/page.c >> @@ -188,6 +188,15 @@ static void set_prefetch_parameters(void) >> } >> break; >> >> + case CPU_LOONGSON3: >> + /* Loongson-3 only support the Pref_Load/Pref_Store. */ >> + pref_bias_clear_store = 128; >> + pref_bias_copy_load = 128; >> + pref_bias_copy_store = 128; >> + pref_src_mode = Pref_Load; >> + pref_dst_mode = Pref_Store; >> + break; >> + >> default: >> pref_bias_clear_store = 128; >> pref_bias_copy_load = 256; >> -- >> 2.4.6 >> >> >> >> >>