Re: [PATCH 2/6] __wr_after_init: write rare for static allocation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, 5 Dec 2018 15:13:56 -0800
Andy Lutomirski <luto@xxxxxxxxxx> wrote:

> I added some s390 and powerpc people.
> 
> On Tue, Dec 4, 2018 at 4:18 AM Igor Stoppa <igor.stoppa@xxxxxxxxx> wrote:
> >
> > Implementation of write rare for statically allocated data, located in a
> > specific memory section through the use of the __write_rare label.
> >
> > The basic functions are:
> > - wr_memset(): write rare counterpart of memset()
> > - wr_memcpy(): write rare counterpart of memcpy()
> > - wr_assign(): write rare counterpart of the assignment ('=') operator
> > - wr_rcu_assign_pointer(): write rare counterpart of rcu_assign_pointer()
> >
> > The implementation is based on code from Andy Lutomirski and Nadav Amit
> > for patching the text on x86 [here goes reference to commits, once merged]
> >
> > The modification of write protected data is done through an alternate
> > mapping of the same pages, as writable.
> > This mapping is local to each core and is active only for the duration
> > of each write operation.
> > Local interrupts are disabled, while the alternate mapping is active.
> >
> > In theory, it could introduce a non-predictable delay, in a preemptible
> > system, however the amount of data to be altered is likely to be far
> > smaller than a page.
> >
> > Signed-off-by: Igor Stoppa <igor.stoppa@xxxxxxxxxx>
> >
> > CC: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
> > CC: Nadav Amit <nadav.amit@xxxxxxxxx>
> > CC: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> > CC: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> > CC: Kees Cook <keescook@xxxxxxxxxxxx>
> > CC: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> > CC: linux-integrity@xxxxxxxxxxxxxxx
> > CC: kernel-hardening@xxxxxxxxxxxxxxxxxx
> > CC: linux-mm@xxxxxxxxx
> > CC: linux-kernel@xxxxxxxxxxxxxxx
> > ---
> >  include/linux/prmem.h | 133 ++++++++++++++++++++++++++++++++++++++++++
> >  init/main.c           |   2 +
> >  mm/Kconfig            |   4 ++
> >  mm/Makefile           |   1 +
> >  mm/prmem.c            | 124 +++++++++++++++++++++++++++++++++++++++
> >  5 files changed, 264 insertions(+)
> >  create mode 100644 include/linux/prmem.h
> >  create mode 100644 mm/prmem.c
> >
> > diff --git a/include/linux/prmem.h b/include/linux/prmem.h
> > new file mode 100644
> > index 000000000000..b0131c1f5dc0
> > --- /dev/null
> > +++ b/include/linux/prmem.h
> > @@ -0,0 +1,133 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * prmem.h: Header for memory protection library
> > + *
> > + * (C) Copyright 2018 Huawei Technologies Co. Ltd.
> > + * Author: Igor Stoppa <igor.stoppa@xxxxxxxxxx>
> > + *
> > + * Support for:
> > + * - statically allocated write rare data
> > + */
> > +
> > +#ifndef _LINUX_PRMEM_H
> > +#define _LINUX_PRMEM_H
> > +
> > +#include <linux/set_memory.h>
> > +#include <linux/mm.h>
> > +#include <linux/vmalloc.h>
> > +#include <linux/string.h>
> > +#include <linux/slab.h>
> > +#include <linux/mutex.h>
> > +#include <linux/compiler.h>
> > +#include <linux/irqflags.h>
> > +
> > +/**
> > + * memtst() - test n bytes of the source to match the c value
> > + * @p: beginning of the memory to test
> > + * @c: byte to compare against
> > + * @len: amount of bytes to test
> > + *
> > + * Returns 0 on success, non-zero otherwise.
> > + */
> > +static inline int memtst(void *p, int c, __kernel_size_t len)
> > +{
> > +       __kernel_size_t i;
> > +
> > +       for (i = 0; i < len; i++) {
> > +               u8 d =  *(i + (u8 *)p) - (u8)c;
> > +
> > +               if (unlikely(d))
> > +                       return d;
> > +       }
> > +       return 0;
> > +}
> > +
> > +
> > +#ifndef CONFIG_PRMEM
> > +
> > +static inline void *wr_memset(void *p, int c, __kernel_size_t len)
> > +{
> > +       return memset(p, c, len);
> > +}
> > +
> > +static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
> > +{
> > +       return memcpy(p, q, size);
> > +}
> > +
> > +#define wr_assign(var, val)    ((var) = (val))
> > +
> > +#define wr_rcu_assign_pointer(p, v)    \
> > +       rcu_assign_pointer(p, v)
> > +
> > +#else
> > +
> > +enum wr_op_type {
> > +       WR_MEMCPY,
> > +       WR_MEMSET,
> > +       WR_RCU_ASSIGN_PTR,
> > +       WR_OPS_NUMBER,
> > +};
> > +
> > +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
> > +             enum wr_op_type op);
> > +
> > +/**
> > + * wr_memset() - sets n bytes of the destination to the c value
> > + * @p: beginning of the memory to write to
> > + * @c: byte to replicate
> > + * @len: amount of bytes to copy
> > + *
> > + * Returns true on success, false otherwise.
> > + */
> > +static inline void *wr_memset(void *p, int c, __kernel_size_t len)
> > +{
> > +       return __wr_op((unsigned long)p, (unsigned long)c, len, WR_MEMSET);
> > +}
> > +
> > +/**
> > + * wr_memcpy() - copyes n bytes from source to destination
> > + * @dst: beginning of the memory to write to
> > + * @src: beginning of the memory to read from
> > + * @n_bytes: amount of bytes to copy
> > + *
> > + * Returns pointer to the destination
> > + */
> > +static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
> > +{
> > +       return __wr_op((unsigned long)p, (unsigned long)q, size, WR_MEMCPY);
> > +}
> > +
> > +/**
> > + * wr_assign() - sets a write-rare variable to a specified value
> > + * @var: the variable to set
> > + * @val: the new value
> > + *
> > + * Returns: the variable
> > + *
> > + * Note: it might be possible to optimize this, to use wr_memset in some
> > + * cases (maybe with NULL?).
> > + */
> > +
> > +#define wr_assign(var, val) ({                 \
> > +       typeof(var) tmp = (typeof(var))val;     \
> > +                                               \
> > +       wr_memcpy(&var, &tmp, sizeof(var));     \
> > +       var;                                    \
> > +})
> > +
> > +/**
> > + * wr_rcu_assign_pointer() - initialize a pointer in rcu mode
> > + * @p: the rcu pointer
> > + * @v: the new value
> > + *
> > + * Returns the value assigned to the rcu pointer.
> > + *
> > + * It is provided as macro, to match rcu_assign_pointer()
> > + */
> > +#define wr_rcu_assign_pointer(p, v) ({                                 \
> > +       __wr_op((unsigned long)&p, v, sizeof(p), WR_RCU_ASSIGN_PTR);    \
> > +       p;                                                              \
> > +})
> > +#endif
> > +#endif
> > diff --git a/init/main.c b/init/main.c
> > index a461150adfb1..a36f2e54f937 100644
> > --- a/init/main.c
> > +++ b/init/main.c
> > @@ -498,6 +498,7 @@ void __init __weak thread_stack_cache_init(void)
> >  void __init __weak mem_encrypt_init(void) { }
> >
> >  void __init __weak poking_init(void) { }
> > +void __init __weak wr_poking_init(void) { }
> >
> >  bool initcall_debug;
> >  core_param(initcall_debug, initcall_debug, bool, 0644);
> > @@ -734,6 +735,7 @@ asmlinkage __visible void __init start_kernel(void)
> >         delayacct_init();
> >
> >         poking_init();
> > +       wr_poking_init();
> >         check_bugs();
> >
> >         acpi_subsystem_init();
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index d85e39da47ae..9b09339c027f 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -142,6 +142,10 @@ config ARCH_DISCARD_MEMBLOCK
> >  config MEMORY_ISOLATION
> >         bool
> >
> > +config PRMEM
> > +       def_bool n
> > +       depends on STRICT_KERNEL_RWX && X86_64
> > +
> >  #
> >  # Only be set on architectures that have completely implemented memory hotplug
> >  # feature. If you are not sure, don't touch it.
> > diff --git a/mm/Makefile b/mm/Makefile
> > index d210cc9d6f80..ef3867c16ce0 100644
> > --- a/mm/Makefile
> > +++ b/mm/Makefile
> > @@ -58,6 +58,7 @@ obj-$(CONFIG_SPARSEMEM)       += sparse.o
> >  obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
> >  obj-$(CONFIG_SLOB) += slob.o
> >  obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
> > +obj-$(CONFIG_PRMEM) += prmem.o
> >  obj-$(CONFIG_KSM) += ksm.o
> >  obj-$(CONFIG_PAGE_POISONING) += page_poison.o
> >  obj-$(CONFIG_SLAB) += slab.o
> > diff --git a/mm/prmem.c b/mm/prmem.c
> > new file mode 100644
> > index 000000000000..e8ab76701831
> > --- /dev/null
> > +++ b/mm/prmem.c
> > @@ -0,0 +1,124 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * prmem.c: Memory Protection Library
> > + *
> > + * (C) Copyright 2017-2018 Huawei Technologies Co. Ltd.
> > + * Author: Igor Stoppa <igor.stoppa@xxxxxxxxxx>
> > + */
> > +
> > +#include <linux/mm.h>
> > +#include <linux/string.h>
> > +#include <linux/compiler.h>
> > +#include <linux/slab.h>
> > +#include <linux/mmu_context.h>
> > +#include <linux/rcupdate.h>
> > +#include <linux/prmem.h>
> > +
> > +static __ro_after_init bool wr_ready;
> > +static __ro_after_init struct mm_struct *wr_poking_mm;
> > +static __ro_after_init unsigned long wr_poking_base;
> > +
> > +/*
> > + * The following two variables are statically allocated by the linker
> > + * script at the the boundaries of the memory region (rounded up to
> > + * multiples of PAGE_SIZE) reserved for __wr_after_init.
> > + */
> > +extern long __start_wr_after_init;
> > +extern long __end_wr_after_init;
> > +
> > +static inline bool is_wr_after_init(unsigned long ptr, __kernel_size_t size)
> > +{
> > +       unsigned long start = (unsigned long)&__start_wr_after_init;
> > +       unsigned long end = (unsigned long)&__end_wr_after_init;
> > +       unsigned long low = ptr;
> > +       unsigned long high = ptr + size;
> > +
> > +       return likely(start <= low && low <= high && high <= end);
> > +}
> > +
> > +
> > +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
> > +             enum wr_op_type op)
> > +{  
> 
> You might end up wanting something like:
> 
> #ifdef __arch_wr_op
> return __arch_wr_op(...);
> #endif
> 
> if an arch (s390? powerpc?) decides to have a totally different
> implementation of this.
> 
> Hi s390 and powerpc people: it would be nice if this generic
> implementation *worked* on your architectures and that it will allow
> you to add some straightforward way to add a better arch-specific
> implementation if you think that would be better.

As the code is right now I can guarantee that it will not work on s390.

> > +       temporary_mm_state_t prev;
> > +       unsigned long flags;
> > +       unsigned long offset;
> > +       unsigned long wr_poking_addr;
> > +
> > +       /* Confirm that the writable mapping exists. */
> > +       BUG_ON(!wr_ready);
> > +
> > +       if (WARN_ONCE(op >= WR_OPS_NUMBER, "Invalid WR operation.") ||
> > +           WARN_ONCE(!is_wr_after_init(dst, len), "Invalid WR range."))
> > +               return (void *)dst;
> > +
> > +       offset = dst - (unsigned long)&__start_wr_after_init;
> > +       wr_poking_addr = wr_poking_base + offset;
> > +       local_irq_save(flags);
> > +       prev = use_temporary_mm(wr_poking_mm);
> > +
> > +       kasan_disable_current();
> > +       if (op == WR_MEMCPY)
> > +               memcpy((void *)wr_poking_addr, (void *)src, len);
> > +       else if (op == WR_MEMSET)
> > +               memset((u8 *)wr_poking_addr, (u8)src, len);
> > +       else if (op == WR_RCU_ASSIGN_PTR)
> > +               /* generic version of rcu_assign_pointer */
> > +               smp_store_release((void **)wr_poking_addr,
> > +                                 RCU_INITIALIZER((void **)src));
> > +       kasan_enable_current();  
> 
> Hmm.  I suspect this will explode quite badly on sane architectures
> like s390.  (In my book, despite how weird s390 is, it has a vastly
> nicer model of "user" memory than any other architecture I know
> of...).  I think you should use copy_to_user(), etc, instead.  I'm not
> entirely sure what the best smp_store_release() replacement is.
> Making this change may also mean you can get rid of the
> kasan_disable_current().

use_temporary_mm() replaces the *user* mm, for s390 this is completely
independent from the kernel mm (different control register cr1 vs cr7).
The wr_poking_addr is not available when running in the kernel address
space.

> > +
> > +       barrier(); /* XXX redundant? */  
> 
> I think it's redundant.  If unuse_temporary_mm() allows earlier stores
> to hit the wrong address space, then something is very very wrong, and
> something is also very very wrong if the optimizer starts moving
> stores across a function call that is most definitely a barrier.
> 
> > +
> > +       unuse_temporary_mm(prev);
> > +       /* XXX make the verification optional? */
> > +       if (op == WR_MEMCPY)
> > +               BUG_ON(memcmp((void *)dst, (void *)src, len));
> > +       else if (op == WR_MEMSET)
> > +               BUG_ON(memtst((void *)dst, (u8)src, len));
> > +       else if (op == WR_RCU_ASSIGN_PTR)
> > +               BUG_ON(*(unsigned long *)dst != src);  
> 
> Hmm.  If you allowed cmpxchg or even plain xchg, then these bug_ons
> would be thoroughly buggy, but maybe they're okay.  But they should,
> at most, be WARN_ON_ONCE(), given that you can trigger them by writing
> the same addresses from two threads at once, and this isn't even
> entirely obviously bogus given the presence of smp_store_release().
> 
> > +       local_irq_restore(flags);
> > +       return (void *)dst;
> > +}
> > +
> > +struct mm_struct *copy_init_mm(void);
> > +void __init wr_poking_init(void)
> > +{
> > +       unsigned long start = (unsigned long)&__start_wr_after_init;
> > +       unsigned long end = (unsigned long)&__end_wr_after_init;
> > +       unsigned long i;
> > +       unsigned long wr_range;
> > +
> > +       wr_poking_mm = copy_init_mm();
> > +       BUG_ON(!wr_poking_mm);
> > +
> > +       /* XXX What if it's too large to fit in the task unmapped mem? */
> > +       wr_range = round_up(end - start, PAGE_SIZE);
> > +
> > +       /* Randomize the poking address base*/
> > +       wr_poking_base = TASK_UNMAPPED_BASE +
> > +               (kaslr_get_random_long("Write Rare Poking") & PAGE_MASK) %
> > +               (TASK_SIZE - (TASK_UNMAPPED_BASE + wr_range));

The wr_poking_base address is a user space address, no? This will be usable
only with the uaccess primitives on s390.

> > +
> > +       /* Create alternate mapping for the entire wr_after_init range. */
> > +       for (i = start; i < end; i += PAGE_SIZE) {
> > +               struct page *page;
> > +               spinlock_t *ptl;
> > +               pte_t pte;
> > +               pte_t *ptep;
> > +               unsigned long wr_poking_addr;
> > +
> > +               BUG_ON(!(page = virt_to_page(i)));
> > +               wr_poking_addr = i - start + wr_poking_base;
> > +
> > +               /* The lock is not needed, but avoids open-coding. */
> > +               ptep = get_locked_pte(wr_poking_mm, wr_poking_addr, &ptl);
> > +               VM_BUG_ON(!ptep);
> > +
> > +               pte = mk_pte(page, PAGE_KERNEL);
> > +               set_pte_at(wr_poking_mm, wr_poking_addr, ptep, pte);
> > +               spin_unlock(ptl);
> > +       }
> > +       wr_ready = true;
> > +}
> > --
> > 2.19.1
> >  
> 


-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.




[Index of Archives]     [Linux Kernel]     [Kernel Newbies]     [x86 Platform Driver]     [Netdev]     [Linux Wireless]     [Netfilter]     [Bugtraq]     [Linux Filesystems]     [Yosemite Discussion]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]

  Powered by Linux