> -----Original Message-----
> From: Vitaly Kuznetsov [mailto:vkuznets@xxxxxxxxxx]
> Sent: Friday, April 7, 2017 4:27 AM
> To: devel@xxxxxxxxxxxxxxxxxxxxxx; x86@xxxxxxxxxx
> Cc: linux-kernel@xxxxxxxxxxxxxxx; KY Srinivasan <kys@xxxxxxxxxxxxx>; Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>; Stephen Hemminger <sthemmin@xxxxxxxxxxxxx>; Thomas Gleixner <tglx@xxxxxxxxxxxxx>; Ingo Molnar <mingo@xxxxxxxxxx>; H. Peter Anvin <hpa@xxxxxxxxx>; Steven Rostedt <rostedt@xxxxxxxxxxx>; Jork Loeser <Jork.Loeser@xxxxxxxxxxxxx>
> Subject: [PATCH 6/7] x86/hyper-v: use hypercall for remote TLB flush
>
> The Hyper-V host can suggest that we use a hypercall for doing remote TLB
> flush; this is supposed to work faster than IPIs.
>
> Implementation details: to do HvFlushVirtualAddress{Space,List} hypercalls
> we need to put the input somewhere in memory, and we don't really want to
> do a memory allocation on each call, so we pre-allocate per-cpu memory areas
> on boot. These areas are of fixed size; limit them with an arbitrary number
> of 16 (16 GVAs are able to specify 16 * 4096 pages).
>
> pv_ops patching happens very early, so we need to separate
> hyperv_setup_mmu_ops() and hyper_alloc_mmu().
>
> It is possible and easy to implement local TLB flushing too, and there is
> even a hint for that. However, I don't see room for optimization on the
> host side, as both the hypercall and a native TLB flush will result in a
> vmexit. The hint is also not set on modern Hyper-V versions.
>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
> ---
>  arch/x86/hyperv/Makefile           |   2 +-
>  arch/x86/hyperv/hv_init.c          |   2 +
>  arch/x86/hyperv/mmu.c              | 128 +++++++++++++++++++++++++++++++++++++
>  arch/x86/include/asm/mshyperv.h    |   2 +
>  arch/x86/include/uapi/asm/hyperv.h |   7 ++
>  arch/x86/kernel/cpu/mshyperv.c     |   1 +
>  6 files changed, 141 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/hyperv/mmu.c
>
> diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
> index 171ae09..367a820 100644
> --- a/arch/x86/hyperv/Makefile
> +++ b/arch/x86/hyperv/Makefile
> @@ -1 +1 @@
> -obj-y := hv_init.o
> +obj-y := hv_init.o mmu.o
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 1c14088..2cf8a98 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -163,6 +163,8 @@ void hyperv_init(void)
>         hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
>         wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
>
> +       hyper_alloc_mmu();
> +
>         /*
>          * Register Hyper-V specific clocksource.
>          */
> diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
> new file mode 100644
> index 0000000..fb487cb
> --- /dev/null
> +++ b/arch/x86/hyperv/mmu.c
> @@ -0,0 +1,128 @@
> +#include <linux/types.h>
> +#include <linux/hyperv.h>
> +#include <linux/slab.h>
> +#include <asm/mshyperv.h>
> +#include <asm/tlbflush.h>
> +#include <asm/msr.h>
> +#include <asm/fpu/api.h>
> +
> +/*
> + * Arbitrary number; we need to pre-allocate per-cpu struct for doing TLB
> + * flush hypercalls and we need to pick a size. '16' means we'll be able
> + * to flush 16 * 4096 pages (256MB) with one hypercall.
> + */
> +#define HV_MMU_MAX_GVAS 16

Did you experiment with different sizes here?

> +
> +/* HvFlushVirtualAddressSpace*, HvFlushVirtualAddressList hypercalls */
> +struct hv_flush_pcpu {
> +       struct {
> +               __u64 address_space;
> +               __u64 flags;
> +               __u64 processor_mask;
> +               __u64 gva_list[HV_MMU_MAX_GVAS];
> +       } flush;
> +
> +       spinlock_t lock;
> +};
> +

We may be supporting more than 64 CPUs in this hypercall.
I am going to inquire with the Windows folks and get back to you.
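A side note on the 64-CPU question: the flush block above has a single __u64
processor_mask, so this input format can only name VP indices 0-63, and
anything beyond that has to fall back to native_flush_tlb_others(). Below is
a minimal standalone sketch of that constraint, purely for illustration (it is
not the patch code; vcpu_of() and build_processor_mask() are made-up names,
with vcpu_of() standing in for vmbus_cpu_number_to_vp_number()):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for vmbus_cpu_number_to_vp_number(). */
static int vcpu_of(int cpu)
{
        return cpu;             /* assume an identity CPU -> VP mapping */
}

/* Returns false when any target VP cannot be represented in a 64-bit mask. */
static bool build_processor_mask(const int *cpus, int ncpus, uint64_t *mask)
{
        *mask = 0;
        for (int i = 0; i < ncpus; i++) {
                int vcpu = vcpu_of(cpus[i]);

                if (vcpu < 0 || vcpu >= 64)
                        return false;   /* caller must fall back to IPIs */
                *mask |= 1ULL << vcpu;  /* 1ULL: a plain 1 would overflow for VP >= 32 */
        }
        return true;
}

int main(void)
{
        int cpus[] = { 0, 3, 63 };
        uint64_t mask;

        if (build_processor_mask(cpus, 3, &mask))
                printf("mask = 0x%016llx\n", (unsigned long long)mask);
        return 0;
}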
> +static struct hv_flush_pcpu __percpu *pcpu_flush;
> +
> +static void hyperv_flush_tlb_others(const struct cpumask *cpus,
> +                                   struct mm_struct *mm, unsigned long start,
> +                                   unsigned long end)
> +{
> +       struct hv_flush_pcpu *flush;
> +       unsigned long cur, flags;
> +       u64 status = -1ULL;
> +       int cpu, vcpu, gva_n;
> +
> +       if (!pcpu_flush || !hv_hypercall_pg)
> +               goto do_native;
> +
> +       if (cpumask_empty(cpus))
> +               return;
> +
> +       flush = this_cpu_ptr(pcpu_flush);
> +       spin_lock_irqsave(&flush->lock, flags);
> +
> +       flush->flush.address_space = virt_to_phys(mm->pgd);
> +       flush->flush.processor_mask = 0;
> +       if (cpumask_equal(cpus, cpu_present_mask)) {
> +               flush->flush.flags = HV_FLUSH_ALL_PROCESSORS;
> +       } else {
> +               flush->flush.flags = 0;
> +               for_each_cpu(cpu, cpus) {
> +                       vcpu = vmbus_cpu_number_to_vp_number(cpu);
> +                       if (vcpu != -1 && vcpu < 64)
> +                               flush->flush.processor_mask |= 1 << vcpu;
> +                       else
> +                               goto unlock_do_native;
> +               }
> +       }
> +
> +       if (end == TLB_FLUSH_ALL) {
> +               flush->flush.flags = HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> +               status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> +                                        &flush->flush, NULL);
> +       } else {
> +               cur = start;
> +more_gvas:
> +               gva_n = 0;
> +
> +               do {
> +                       flush->flush.gva_list[gva_n] = cur & PAGE_MASK;
> +                       /*
> +                        * Lower 12 bits encode the number of additional
> +                        * pages to flush (in addition to the 'cur' page).
> +                        */
> +                       if (end >= cur + PAGE_SIZE * PAGE_SIZE)
> +                               flush->flush.gva_list[gva_n] |= ~PAGE_MASK;
> +                       else if (end > cur)
> +                               flush->flush.gva_list[gva_n] |=
> +                                       (end - cur - 1) >> PAGE_SHIFT;
> +
> +                       cur += PAGE_SIZE * PAGE_SIZE;
> +                       ++gva_n;
> +
> +               } while (cur < end && gva_n < HV_MMU_MAX_GVAS);
> +
> +               status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST,
> +                                            gva_n, &flush->flush, NULL);
> +
> +               if (!(status & 0xffff) && cur < end)
> +                       goto more_gvas;
> +       }
> +
> +unlock_do_native:
> +       spin_unlock_irqrestore(&flush->lock, flags);
> +
> +       if (!(status & 0xffff))
> +               return;
> +do_native:
> +       native_flush_tlb_others(cpus, mm, start, end);
> +}
> +
> +void hyperv_setup_mmu_ops(void)
> +{
> +       if (ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED) {
> +               pr_info("Hyper-V: Using hypercall for remote TLB flush\n");
> +               pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others;
> +       }
> +}
> +
> +void hyper_alloc_mmu(void)
> +{
> +       int cpu;
> +       struct hv_flush_pcpu *flush;
> +
> +       if (ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED) {
> +               pcpu_flush = alloc_percpu(struct hv_flush_pcpu);
> +               if (!pcpu_flush)
> +                       return;
> +
> +               for_each_possible_cpu(cpu) {
> +                       flush = per_cpu_ptr(pcpu_flush, cpu);
> +                       spin_lock_init(&flush->lock);
> +               }
> +       }
> +}
> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
> index 1293c84..a5041c3 100644
> --- a/arch/x86/include/asm/mshyperv.h
> +++ b/arch/x86/include/asm/mshyperv.h
> @@ -301,6 +301,8 @@ static inline int vmbus_cpu_number_to_vp_number(int cpu_number)
>  }
>
>  void hyperv_init(void);
> +void hyperv_setup_mmu_ops(void);
> +void hyper_alloc_mmu(void);
>  void hyperv_report_panic(struct pt_regs *regs);
>  bool hv_is_hypercall_page_setup(void);
>  void hyperv_cleanup(void);
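For reference, on the gva_list encoding used in hyperv_flush_tlb_others()
above: each 64-bit entry carries a page-aligned GVA plus, in its low 12 bits,
the count of additional consecutive pages to flush, so one entry covers at
most 4096 pages (16MB with 4K pages) and a full batch of 16 entries covers
256MB per hypercall. Here is a small self-contained sketch of that packing,
again illustration only (not kernel code; encode_range() is a made-up name):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       ((uint64_t)1 << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))
#define MAX_GVAS        16

/* Pack [start, end) into gva_list entries; returns the rep count. */
static int encode_range(uint64_t start, uint64_t end, uint64_t *gva_list)
{
        uint64_t cur = start;
        int n = 0;

        while (cur < end && n < MAX_GVAS) {
                uint64_t entry = cur & PAGE_MASK;

                if (end >= cur + PAGE_SIZE * PAGE_SIZE)
                        entry |= ~PAGE_MASK;    /* full 4096 pages in one entry */
                else
                        entry |= (end - cur - 1) >> PAGE_SHIFT;
                gva_list[n++] = entry;
                cur += PAGE_SIZE * PAGE_SIZE;
        }
        return n;
}

int main(void)
{
        uint64_t list[MAX_GVAS];
        /* e.g. a 20MB range needs 2 entries: 16MB + 4MB */
        int n = encode_range(0x7f0000000000ULL,
                             0x7f0000000000ULL + 20 * 1024 * 1024, list);

        printf("%d entries, first covers %llu pages\n",
               n, (unsigned long long)((list[0] & ~PAGE_MASK) + 1));
        return 0;
}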
> diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
> index c87e900..3d44036 100644
> --- a/arch/x86/include/uapi/asm/hyperv.h
> +++ b/arch/x86/include/uapi/asm/hyperv.h
> @@ -239,6 +239,8 @@
>                 (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
>
>  /* Declare the various hypercall operations. */
> +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE      0x0002
> +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST       0x0003
>  #define HVCALL_NOTIFY_LONG_SPIN_WAIT            0x0008
>  #define HVCALL_POST_MESSAGE                     0x005c
>  #define HVCALL_SIGNAL_EVENT                     0x005d
> @@ -256,6 +258,11 @@
>  #define HV_PROCESSOR_POWER_STATE_C2             2
>  #define HV_PROCESSOR_POWER_STATE_C3             3
>
> +#define HV_FLUSH_ALL_PROCESSORS                 0x00000001
> +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES     0x00000002
> +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY       0x00000004
> +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT      0x00000008
> +
>  /* Hypercall interface */
>  union hv_hypercall_input {
>         u64 as_uint64;
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index 04cb8d3..fc228d8 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -233,6 +233,7 @@ static void __init ms_hyperv_init_platform(void)
>          * Setup the hook to get control post apic initialization.
>          */
>         x86_platform.apic_post_init = hyperv_init;
> +       hyperv_setup_mmu_ops();
>  #endif
>  }
>
> --
> 2.9.3

_______________________________________________
devel mailing list
devel@xxxxxxxxxxxxxxxxxxxxxx
http://driverdev.linuxdriverproject.org/mailman/listinfo/driverdev-devel