Signed-off-by: Peter Collingbourne <pcc@xxxxxxxxxx> --- On Thu, Dec 19, 2019 at 12:32 PM Peter Collingbourne <pcc@xxxxxxxxxx> wrote: > > On Wed, Dec 11, 2019 at 10:45 AM Catalin Marinas > <catalin.marinas@xxxxxxx> wrote: > > + if (current->thread.sctlr_tcf0 != next->thread.sctlr_tcf0) > > + update_sctlr_el1_tcf0(next->thread.sctlr_tcf0); > > I don't entirely understand why yet, but I've found that this check is > insufficient for ensuring consistency between SCTLR_EL1.TCF0 and > sctlr_tcf0. In my Android test environment with some processes having > sctlr_tcf0=SCTLR_EL1_TCF0_SYNC and others having sctlr_tcf0=0, I am > seeing intermittent tag failures coming from the sctlr_tcf0=0 > processes. With this patch: > > diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c > index ef3bfa2bf2b1..4e5d02520a51 100644 > --- a/arch/arm64/mm/fault.c > +++ b/arch/arm64/mm/fault.c > @@ -663,6 +663,8 @@ static int do_sea(unsigned long addr, unsigned int > esr, struct pt_regs *regs) > static int do_tag_check_fault(unsigned long addr, unsigned int esr, > struct pt_regs *regs) > { > + printk(KERN_ERR "do_tag_check_fault %lx %lx\n", > + current->thread.sctlr_tcf0, read_sysreg(sctlr_el1)); > do_bad_area(addr, esr, regs); > return 0; > } > > I see dmesg output like this: > > [ 15.249216] do_tag_check_fault 0 c60fc64791d > > showing that SCTLR_EL1.TCF0 became inconsistent with sctlr_tcf0. This > patch fixes the problem for me: > > diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c > index fba89c9f070b..fb012f0baa12 100644 > --- a/arch/arm64/kernel/process.c > +++ b/arch/arm64/kernel/process.c > @@ -518,9 +518,7 @@ static void mte_thread_switch(struct task_struct *next) > if (!system_supports_mte()) > return; > > - /* avoid expensive SCTLR_EL1 accesses if no change */ > - if (current->thread.sctlr_tcf0 != next->thread.sctlr_tcf0) > - update_sctlr_el1_tcf0(next->thread.sctlr_tcf0); > + update_sctlr_el1_tcf0(next->thread.sctlr_tcf0); > update_gcr_el1_excl(next->thread.gcr_excl); > } > #else > @@ -643,15 +641,8 @@ static long set_mte_ctrl(unsigned long arg) > return -EINVAL; > } > > - /* > - * mte_thread_switch() checks current->thread.sctlr_tcf0 as an > - * optimisation. Disable preemption so that it does not see > - * the variable update before the SCTLR_EL1.TCF0 one. > - */ > - preempt_disable(); > current->thread.sctlr_tcf0 = tcf0; > update_sctlr_el1_tcf0(tcf0); > - preempt_enable(); > > current->thread.gcr_excl = (arg & PR_MTE_EXCL_MASK) >> > PR_MTE_EXCL_SHIFT; > update_gcr_el1_excl(current->thread.gcr_excl); > > Since sysreg_clear_set only sets the sysreg if it ended up changing, I > wouldn't expect this to cause a significant performance hit unless > just reading SCTLR_EL1 is expensive. That being said, if the > inconsistency is indicative of a deeper problem, we should probably > address that. I tracked it down to the flush_mte_state() function setting sctlr_tcf0 but failing to update SCTLR_EL1.TCF0. With this patch I am not seeing any more inconsistencies. Peter arch/arm64/kernel/process.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index fba89c9f070b..07e8e7bd3bec 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -319,6 +319,25 @@ static void flush_tagged_addr_state(void) } #ifdef CONFIG_ARM64_MTE +static void update_sctlr_el1_tcf0(u64 tcf0) +{ + /* no need for ISB since this only affects EL0, implicit with ERET */ + sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF0_MASK, tcf0); +} + +static void set_sctlr_el1_tcf0(u64 tcf0) +{ + /* + * mte_thread_switch() checks current->thread.sctlr_tcf0 as an + * optimisation. Disable preemption so that it does not see + * the variable update before the SCTLR_EL1.TCF0 one. + */ + preempt_disable(); + current->thread.sctlr_tcf0 = tcf0; + update_sctlr_el1_tcf0(tcf0); + preempt_enable(); +} + static void flush_mte_state(void) { if (!system_supports_mte()) @@ -327,7 +346,7 @@ static void flush_mte_state(void) /* clear any pending asynchronous tag fault */ clear_thread_flag(TIF_MTE_ASYNC_FAULT); /* disable tag checking */ - current->thread.sctlr_tcf0 = 0; + set_sctlr_el1_tcf0(0); } #else static void flush_mte_state(void) @@ -497,12 +516,6 @@ static void ssbs_thread_switch(struct task_struct *next) } #ifdef CONFIG_ARM64_MTE -static void update_sctlr_el1_tcf0(u64 tcf0) -{ - /* no need for ISB since this only affects EL0, implicit with ERET */ - sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF0_MASK, tcf0); -} - static void update_gcr_el1_excl(u64 excl) { /* @@ -643,15 +656,7 @@ static long set_mte_ctrl(unsigned long arg) return -EINVAL; } - /* - * mte_thread_switch() checks current->thread.sctlr_tcf0 as an - * optimisation. Disable preemption so that it does not see - * the variable update before the SCTLR_EL1.TCF0 one. - */ - preempt_disable(); - current->thread.sctlr_tcf0 = tcf0; - update_sctlr_el1_tcf0(tcf0); - preempt_enable(); + set_sctlr_el1_tcf0(tcf0); current->thread.gcr_excl = (arg & PR_MTE_EXCL_MASK) >> PR_MTE_EXCL_SHIFT; update_gcr_el1_excl(current->thread.gcr_excl); -- 2.24.1.735.g03f4e72817-goog