Hi David. On Tue, Apr 11, 2023 at 04:25:09PM +0200, David Hildenbrand wrote: > On sparc64, there is no HW modified bit, therefore, SW tracks via a SW > bit if the PTE is dirty via pte_mkdirty(). However, pte_mkdirty() > currently also unconditionally sets the HW writable bit, which is wrong. > > pte_mkdirty() is not supposed to make a PTE actually writable, unless the > SW writable bit -- pte_write() -- indicates that the PTE is not > write-protected. Fortunately, sparc64 also defines a SW writable bit. > > For example, this already turned into a problem in the context of > THP splitting as documented in commit 624a2c94f5b7 ("Partly revert "mm/thp: > carry over dirty bit when thp splits on pmd""), and for page migration, > as documented in commit 96a9c287e25d ("mm/migrate: fix wrongly apply write > bit after mkdirty on sparc64"). > > Also, we might want to use the dirty PTE bit in the context of KSM with > shared zeropage [1], whereby setting the page writable would be > problematic. > > But more general, any code that might end up setting a PTE/PMD dirty > inside a VM without write permissions is possibly broken, > > Before this commit (sun4u in QEMU): > root@debian:~/linux/tools/testing/selftests/mm# ./mkdirty > # [INFO] detected THP size: 8192 KiB > TAP version 13 > 1..6 > # [INFO] PTRACE write access > not ok 1 SIGSEGV generated, page not modified > # [INFO] PTRACE write access to THP > not ok 2 SIGSEGV generated, page not modified > # [INFO] Page migration > ok 3 SIGSEGV generated, page not modified > # [INFO] Page migration of THP > ok 4 SIGSEGV generated, page not modified > # [INFO] PTE-mapping a THP > ok 5 SIGSEGV generated, page not modified > # [INFO] UFFDIO_COPY > not ok 6 SIGSEGV generated, page not modified > Bail out! 3 out of 6 tests failed > # Totals: pass:3 fail:3 xfail:0 xpass:0 skip:0 error:0 > > Test #3,#4,#5 pass ever since we added some MM workarounds, the > underlying issue remains. > > Let's fix the remaining issues and prepare for reverting the workarounds > by setting the HW writable bit only if both, the SW dirty bit and the SW > writable bit are set. > > We have to move pte_dirty() and pte_dirty() up. The code patching One of the pte_dirty() should be replaced with pte_write(). It would have been nice to separate moving and changes in two patches, but keeping it together works too. > mechanism and handling constants > 22bit is a bit special on sparc64. > > The ASM logic in pte_mkdirty() and pte_mkwrite() match the logic in > pte_mkold() to create the mask depending on the machine type. The ASM > logic in __pte_mkhwwrite() matches the logic in pte_present(), just > using an "or" instead of an "and" instruction. > > With this commit (sun4u in QEMU): > root@debian:~/linux/tools/testing/selftests/mm# ./mkdirty > # [INFO] detected THP size: 8192 KiB > TAP version 13 > 1..6 > # [INFO] PTRACE write access > ok 1 SIGSEGV generated, page not modified > # [INFO] PTRACE write access to THP > ok 2 SIGSEGV generated, page not modified > # [INFO] Page migration > ok 3 SIGSEGV generated, page not modified > # [INFO] Page migration of THP > ok 4 SIGSEGV generated, page not modified > # [INFO] PTE-mapping a THP > ok 5 SIGSEGV generated, page not modified > # [INFO] UFFDIO_COPY > ok 6 SIGSEGV generated, page not modified > # Totals: pass:6 fail:0 xfail:0 xpass:0 skip:0 error:0 Nice! > > This handling seems to have been in place forever. > > [1] https://lkml.kernel.org/r/533a7c3d-3a48-b16b-b421-6e8386e0b142@xxxxxxxxxx > > Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") > Signed-off-by: David Hildenbrand <david@xxxxxxxxxx> I tried to follow your changes, but my knowledge of gcc assembler failed me. But based on the nice and detailed change log and the code I managed to understand: Acked-by: Sam Ravnborg <sam@xxxxxxxxxxxx> > --- > arch/sparc/include/asm/pgtable_64.h | 116 ++++++++++++++++------------ > 1 file changed, 66 insertions(+), 50 deletions(-) > > diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h > index 2dc8d4641734..5563efa1a19f 100644 > --- a/arch/sparc/include/asm/pgtable_64.h > +++ b/arch/sparc/include/asm/pgtable_64.h > @@ -357,6 +357,42 @@ static inline pgprot_t pgprot_noncached(pgprot_t prot) > */ > #define pgprot_noncached pgprot_noncached > > +static inline unsigned long pte_dirty(pte_t pte) > +{ > + unsigned long mask; > + > + __asm__ __volatile__( > + "\n661: mov %1, %0\n" > + " nop\n" > + " .section .sun4v_2insn_patch, \"ax\"\n" > + " .word 661b\n" > + " sethi %%uhi(%2), %0\n" > + " sllx %0, 32, %0\n" > + " .previous\n" > + : "=r" (mask) > + : "i" (_PAGE_MODIFIED_4U), "i" (_PAGE_MODIFIED_4V)); > + > + return (pte_val(pte) & mask); > +} > + > +static inline unsigned long pte_write(pte_t pte) > +{ > + unsigned long mask; > + > + __asm__ __volatile__( > + "\n661: mov %1, %0\n" > + " nop\n" > + " .section .sun4v_2insn_patch, \"ax\"\n" > + " .word 661b\n" > + " sethi %%uhi(%2), %0\n" > + " sllx %0, 32, %0\n" > + " .previous\n" > + : "=r" (mask) > + : "i" (_PAGE_WRITE_4U), "i" (_PAGE_WRITE_4V)); > + > + return (pte_val(pte) & mask); > +} > + > #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) > pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags); > #define arch_make_huge_pte arch_make_huge_pte > @@ -418,28 +454,43 @@ static inline bool is_hugetlb_pte(pte_t pte) > } > #endif > > +static inline pte_t __pte_mkhwwrite(pte_t pte) > +{ > + unsigned long val = pte_val(pte); > + > + /* > + * Note: we only want to set the HW writable bit if the SW writable bit > + * and the SW dirty bit are set. > + */ > + __asm__ __volatile__( > + "\n661: or %0, %2, %0\n" > + " .section .sun4v_1insn_patch, \"ax\"\n" > + " .word 661b\n" > + " or %0, %3, %0\n" > + " .previous\n" > + : "=r" (val) > + : "0" (val), "i" (_PAGE_W_4U), "i" (_PAGE_W_4V)); > + > + return __pte(val); > +} > + > static inline pte_t pte_mkdirty(pte_t pte) > { > - unsigned long val = pte_val(pte), tmp; > + unsigned long val = pte_val(pte), mask; > > __asm__ __volatile__( > - "\n661: or %0, %3, %0\n" > - " nop\n" > - "\n662: nop\n" > + "\n661: mov %1, %0\n" > " nop\n" > " .section .sun4v_2insn_patch, \"ax\"\n" > " .word 661b\n" > - " sethi %%uhi(%4), %1\n" > - " sllx %1, 32, %1\n" > - " .word 662b\n" > - " or %1, %%lo(%4), %1\n" > - " or %0, %1, %0\n" > + " sethi %%uhi(%2), %0\n" > + " sllx %0, 32, %0\n" > " .previous\n" > - : "=r" (val), "=r" (tmp) > - : "0" (val), "i" (_PAGE_MODIFIED_4U | _PAGE_W_4U), > - "i" (_PAGE_MODIFIED_4V | _PAGE_W_4V)); > + : "=r" (mask) > + : "i" (_PAGE_MODIFIED_4U), "i" (_PAGE_MODIFIED_4V)); > > - return __pte(val); > + pte = __pte(val | mask); > + return pte_write(pte) ? __pte_mkhwwrite(pte) : pte; > } > > static inline pte_t pte_mkclean(pte_t pte) > @@ -481,7 +532,8 @@ static inline pte_t pte_mkwrite(pte_t pte) > : "=r" (mask) > : "i" (_PAGE_WRITE_4U), "i" (_PAGE_WRITE_4V)); > > - return __pte(val | mask); > + pte = __pte(val | mask); > + return pte_dirty(pte) ? __pte_mkhwwrite(pte) : pte; > } > > static inline pte_t pte_wrprotect(pte_t pte) > @@ -584,42 +636,6 @@ static inline unsigned long pte_young(pte_t pte) > return (pte_val(pte) & mask); > } > > -static inline unsigned long pte_dirty(pte_t pte) > -{ > - unsigned long mask; > - > - __asm__ __volatile__( > - "\n661: mov %1, %0\n" > - " nop\n" > - " .section .sun4v_2insn_patch, \"ax\"\n" > - " .word 661b\n" > - " sethi %%uhi(%2), %0\n" > - " sllx %0, 32, %0\n" > - " .previous\n" > - : "=r" (mask) > - : "i" (_PAGE_MODIFIED_4U), "i" (_PAGE_MODIFIED_4V)); > - > - return (pte_val(pte) & mask); > -} > - > -static inline unsigned long pte_write(pte_t pte) > -{ > - unsigned long mask; > - > - __asm__ __volatile__( > - "\n661: mov %1, %0\n" > - " nop\n" > - " .section .sun4v_2insn_patch, \"ax\"\n" > - " .word 661b\n" > - " sethi %%uhi(%2), %0\n" > - " sllx %0, 32, %0\n" > - " .previous\n" > - : "=r" (mask) > - : "i" (_PAGE_WRITE_4U), "i" (_PAGE_WRITE_4V)); > - > - return (pte_val(pte) & mask); > -} > - > static inline unsigned long pte_exec(pte_t pte) > { > unsigned long mask; > -- > 2.39.2