Vitaly Kuznetsov <vkuznets@xxxxxxxxxx> writes: > mikelley@xxxxxxxxxxxxxxxxxxxxxx writes: > >> From: Michael Kelley <mikelley@xxxxxxxxxxxxx> >> >> The 2016 version of Hyper-V offers the option to operate the guest VM >> per-vcpu stimer's in Direct Mode, which means the timer interupts on its >> own vector rather than queueing a VMbus message. Direct Mode reduces >> timer processing overhead in both the hypervisor and the guest, and >> avoids having timer interrupts pollute the VMbus interrupt stream for >> the synthetic NIC and storage. This patch enables Direct Mode by >> default on stimer0 (which is the only stimer used in Linux on Hyper-V) >> when running on a version of Hyper-V that supports it, with a hv_vmbus >> module parameter for disabling Direct Mode and reverting to the old >> behavior. >> >> Signed-off-by: Michael Kelley <mikelley@xxxxxxxxxxxxx> >> --- >> arch/x86/include/asm/mshyperv.h | 14 ++++++++ >> arch/x86/include/uapi/asm/hyperv.h | 26 ++++++++++++++ >> arch/x86/kernel/cpu/mshyperv.c | 64 +++++++++++++++++++++++++++++++++- >> drivers/hv/hv.c | 71 ++++++++++++++++++++++++++++++++++++-- >> drivers/hv/hyperv_vmbus.h | 4 ++- >> 5 files changed, 175 insertions(+), 4 deletions(-) >> (in addition to my previous comment) This patch is also x86-heavy so it probably makes sense to make x86 maintainers (Thomas, Peter, Ingo) aware no matter which tree it will go through. CC: x86@xxxxxxxxxx >> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h >> index 740dc97..1bba1d7 100644 >> --- a/arch/x86/include/asm/mshyperv.h >> +++ b/arch/x86/include/asm/mshyperv.h >> @@ -4,6 +4,8 @@ >> #include <linux/types.h> >> #include <linux/atomic.h> >> #include <linux/nmi.h> >> +#include <linux/irq.h> >> +#include <linux/irqdesc.h> >> #include <asm/io.h> >> #include <asm/hyperv.h> >> >> @@ -374,4 +376,16 @@ static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void) >> return NULL; >> } >> #endif >> + >> +/* Per architecture routines for stimer0 Direct Mode handling. On x86/x64 >> + * there are no percpu actions to take. >> + */ >> +#if IS_ENABLED(CONFIG_HYPERV) >> +static inline void hv_enable_stimer0_percpu_irq(int irq) { } >> +static inline void hv_disable_stimer0_percpu_irq(int irq) { } >> +extern int hv_allocate_stimer0_irq(int *irq, int *vector); >> +extern void hv_deallocate_stimer0_irq(int irq); >> +extern void hv_ack_stimer0_interrupt(struct irq_desc *desc); >> +#endif >> + >> #endif >> diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h >> index f65d125..408cf3e 100644 >> --- a/arch/x86/include/uapi/asm/hyperv.h >> +++ b/arch/x86/include/uapi/asm/hyperv.h >> @@ -112,6 +112,22 @@ >> #define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) >> /* Guest crash data handler available */ >> #define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10) >> +/* Debug MSRs available */ >> +#define HV_X64_DEBUG_MSR_AVAILABLE (1 << 11) >> +/* Support for Non-Privileged Instruction Execution Prevention is available */ >> +#define HV_X64_NPIEP_AVAILABLE (1 << 12) >> +/* Support for DisableHypervisor is available */ >> +#define HV_X64_DISABLE_HYPERVISOR_AVAILABLE (1 << 13) >> +/* Extended GVA Ranges for Flush Virtual Address list is available */ >> +#define HV_X64_EXTENDED_GVA_RANGE_AVAILABLE (1 << 14) >> +/* Return Hypercall output via XMM registers is available */ >> +#define HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE (1 << 15) >> +/* SINT polling mode available */ >> +#define HV_X64_SINT_POLLING_MODE_AVAILABLE (1 << 17) >> +/* Hypercall MSR lock is available */ >> +#define HV_X64_HYPERCALL_MSR_LOCK_AVAILABLE (1 << 18) >> +/* stimer direct mode is available */ >> +#define HV_X64_STIMER_DIRECT_MODE_AVAILABLE (1 << 19) >> >> /* >> * Implementation recommendations. Indicates which behaviors the hypervisor >> @@ -300,6 +316,16 @@ enum HV_GENERIC_SET_FORMAT { >> >> #define HV_SYNIC_STIMER_COUNT (4) >> >> +/* Hardware IRQ number to use for stimer0 in Direct Mode. This IRQ is a fake >> + * because stimer's in Direct Mode simply interrupt on the specified vector, >> + * without using a particular IOAPIC pin. But we use the IRQ allocation >> + * machinery, so we need a hardware IRQ #. This value is somewhat arbitrary, >> + * but it should not be a legacy IRQ (0 to 15), and should fit within the >> + * single IOAPIC (0 to 23) that Hyper-V provides to a guest VM. So any value >> + * between 16 and 23 should be good. >> + */ > > (nitpick): all comments in the patch are in 'net' style: > > /* This is a > * comment. > */ > > While in Hyper-V files (and x86 in general) the 'normal' style is used: > > /* > * This is a > * comment. > */ > > But checkpatch.pl doesn't complain. > >> +#define HV_STIMER0_IRQNR 18 >> + >> /* Define synthetic interrupt controller message constants. */ >> #define HV_MESSAGE_SIZE (256) >> #define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) >> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c >> index 236324e8..88dc243 100644 >> --- a/arch/x86/kernel/cpu/mshyperv.c >> +++ b/arch/x86/kernel/cpu/mshyperv.c >> @@ -19,7 +19,10 @@ >> #include <linux/efi.h> >> #include <linux/interrupt.h> >> #include <linux/irq.h> >> +#include <linux/irqdomain.h> >> +#include <linux/irqdesc.h> >> #include <linux/kexec.h> >> +#include <linux/acpi.h> >> #include <asm/processor.h> >> #include <asm/hypervisor.h> >> #include <asm/hyperv.h> >> @@ -27,6 +30,7 @@ >> #include <asm/desc.h> >> #include <asm/irq_regs.h> >> #include <asm/i8259.h> >> +#include <asm/irqdomain.h> >> #include <asm/apic.h> >> #include <asm/timer.h> >> #include <asm/reboot.h> >> @@ -69,6 +73,64 @@ void hv_remove_vmbus_irq(void) >> EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); >> EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); >> >> + >> +/* Routines to do per-architecture handling of stimer0 when in Direct Mode */ >> + >> +void hv_ack_stimer0_interrupt(struct irq_desc *desc) >> +{ >> + ack_APIC_irq(); >> +} >> + >> +static void allonline_vector_allocation_domain(int cpu, struct cpumask *retmask, >> + const struct cpumask *mask) >> +{ >> + cpumask_copy(retmask, cpu_online_mask); >> +} >> + >> +int hv_allocate_stimer0_irq(int *irq, int *vector) >> +{ >> + int localirq; >> + int result; >> + struct irq_data *irq_data; >> + >> + /* The normal APIC vector allocation domain allows allocation of vectors >> + * only for the calling CPU. So we change the allocation domain to one >> + * that allows vectors to be allocated in all online CPUs. This >> + * change is fine in a Hyper-V VM because VMs don't have the usual >> + * complement of interrupting devices. >> + */ >> + apic->vector_allocation_domain = allonline_vector_allocation_domain; >> + localirq = acpi_register_gsi(NULL, HV_STIMER0_IRQNR, >> + ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_HIGH); >> + if (localirq < 0) { >> + pr_err("Cannot register stimer0 gsi. Error %d", localirq); >> + return -1; >> + } >> + >> + /* We pass in a dummy IRQ handler because architecture independent code >> + * will later override the IRQ domain interrupt handler and set it to a >> + * Hyper-V specific handler. >> + */ >> + result = request_irq(localirq, (irq_handler_t)(-1), 0, >> + "Hyper-V stimer0", NULL); > > I remember K. Y. told me he has patches in his stash to implement > Hyper-V-specific APIC. Is this still in works or there is a reason to > not do it? Just curious. > >> + if (result) { >> + pr_err("Cannot request stimer0 irq. Error %d", result); >> + acpi_unregister_gsi(localirq); >> + return -1; >> + } >> + irq_data = irq_domain_get_irq_data(x86_vector_domain, localirq); > > In theory irq_domain_get_irq_data() can return NULL. > >> + *vector = irqd_cfg(irq_data)->vector; >> + *irq = localirq; >> + return 0; >> +} >> + >> +void hv_deallocate_stimer0_irq(int irq) >> +{ >> + free_irq(irq, NULL); >> + acpi_unregister_gsi(irq); >> +} >> + >> + >> void hv_setup_kexec_handler(void (*handler)(void)) >> { >> hv_kexec_handler = handler; >> @@ -195,7 +257,7 @@ static void __init ms_hyperv_init_platform(void) >> hv_host_info_ecx = cpuid_ecx(HVCPUID_VERSION); >> hv_host_info_edx = cpuid_edx(HVCPUID_VERSION); >> >> - pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n", >> + pr_info("Hyper-V: Host Build %d-%d.%d-%d-%d.%d\n", > > While this definitely makes logs nicer uhe change is probably not needed > in this particular patch. > >> hv_host_info_eax, hv_host_info_ebx >> 16, >> hv_host_info_ebx & 0xFFFF, hv_host_info_ecx, >> hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF); >> diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c >> index fe96aab..68ac474 100644 >> --- a/drivers/hv/hv.c >> +++ b/drivers/hv/hv.c >> @@ -27,8 +27,12 @@ >> #include <linux/vmalloc.h> >> #include <linux/hyperv.h> >> #include <linux/version.h> >> -#include <linux/interrupt.h> >> +#include <linux/irq.h> >> +#include <linux/irqdesc.h> >> +#include <linux/random.h> >> +#include <linux/kernel_stat.h> >> #include <linux/clockchips.h> >> +#include <linux/module.h> >> #include <asm/hyperv.h> >> #include <asm/mshyperv.h> >> #include "hyperv_vmbus.h" >> @@ -38,6 +42,21 @@ struct hv_context hv_context = { >> .synic_initialized = false, >> }; >> >> +/* If true, we're using Direct Mode for stimer0, and the timer will do it own >> + * interrupt when it expires. If false, stimer0 is not using Direct Mode and >> + * will send a VMbus message when it expires. We prefer to use Direct Mode, >> + * but not all versions of Hyper-V support Direct Mode. >> + * >> + * While Hyper-V provides a total of four stimer's per CPU, Linux use only >> + * stimer0. >> + */ >> +static bool stimer_direct_mode; >> +static int stimer0_irq; >> +static int stimer0_vector; >> +static bool direct_mode_disable; >> +module_param(direct_mode_disable, bool, 0444); >> +MODULE_PARM_DESC(direct_mode_disable, "Set to Y to disable Direct Mode."); >> + >> #define HV_TIMER_FREQUENCY (10 * 1000 * 1000) /* 100ns period */ >> #define HV_MAX_MAX_DELTA_TICKS 0xffffffff >> #define HV_MIN_DELTA_TICKS 1 >> @@ -52,7 +71,12 @@ int hv_init(void) >> hv_context.cpu_context = alloc_percpu(struct hv_per_cpu_context); >> if (!hv_context.cpu_context) >> return -ENOMEM; >> + stimer_direct_mode = (ms_hyperv.misc_features & >> + HV_X64_STIMER_DIRECT_MODE_AVAILABLE) ? true : false; >> >> + /* Apply boot command line override to the Direct Mode setting */ >> + if (direct_mode_disable) >> + stimer_direct_mode = false; > > Having both direct_mode_disable and stimer_direct_mode seems redundant. > > we may write something like > > if (!direct_mode_disable) > direct_mode_disable = (ms_hyperv.misc_features & HV_X64_STIMER_DIRECT_MODE_AVAILABLE) ? false : true; > > and use it everywhere. > >> return 0; >> } >> >> @@ -91,6 +115,23 @@ int hv_post_message(union hv_connection_id connection_id, >> return status & 0xFFFF; >> } >> >> +/* ISR for when stimer0 is operating in Direct Mode. Direct Mode does >> + * not use VMBus or any VMBus messages, so process here and not in the >> + * VMBus driver code. >> + */ >> + >> +static void hv_stimer0_isr(struct irq_desc *desc) >> +{ >> + struct hv_per_cpu_context *hv_cpu; >> + >> + __this_cpu_inc(*desc->kstat_irqs); >> + __this_cpu_inc(kstat.irqs_sum); >> + hv_ack_stimer0_interrupt(desc); >> + hv_cpu = this_cpu_ptr(hv_context.cpu_context); >> + hv_cpu->clk_evt->event_handler(hv_cpu->clk_evt); >> + add_interrupt_randomness(desc->irq_data.irq, 0); > > AFAIU implementing Hyper-V specific IRQ chip would allow us to drop this > as we would do this on 'normal' Linux IRQ path. > >> +} >> + >> static int hv_ce_set_next_event(unsigned long delta, >> struct clock_event_device *evt) >> { >> @@ -108,6 +149,8 @@ static int hv_ce_shutdown(struct clock_event_device *evt) >> { >> hv_init_timer(HV_X64_MSR_STIMER0_COUNT, 0); >> hv_init_timer_config(HV_X64_MSR_STIMER0_CONFIG, 0); >> + if (stimer_direct_mode) >> + hv_disable_stimer0_percpu_irq(stimer0_irq); >> > > Both hv_disable_stimer0_percpu_irq() and hv_enable_stimer0_percpu_irq() > are no-ops, right? Why don't we mask/unmask the IRQ? I may have missed > something... > >> return 0; >> } >> @@ -116,9 +159,24 @@ static int hv_ce_set_oneshot(struct clock_event_device *evt) >> { >> union hv_timer_config timer_cfg; >> >> + timer_cfg.as_uint64 = 0; /* Zero everything */ >> timer_cfg.enable = 1; >> timer_cfg.auto_enable = 1; >> - timer_cfg.sintx = VMBUS_MESSAGE_SINT; >> + if (stimer_direct_mode) { >> + >> + /* When it expires, the timer will directly interrupt >> + * on the specific hardware vector. >> + */ >> + timer_cfg.direct_mode = 1; >> + timer_cfg.apic_vector = stimer0_vector; >> + hv_enable_stimer0_percpu_irq(stimer0_irq); >> + } else { >> + /* When it expires, the timer will generate a VMbus message, >> + * to be handled by the normal VMbus interrupt handler. >> + */ >> + timer_cfg.direct_mode = 0; >> + timer_cfg.sintx = VMBUS_MESSAGE_SINT; >> + } >> hv_init_timer_config(HV_X64_MSR_STIMER0_CONFIG, timer_cfg.as_uint64); >> >> return 0; >> @@ -191,6 +249,12 @@ int hv_synic_alloc(void) >> INIT_LIST_HEAD(&hv_cpu->chan_list); >> } >> >> + if (stimer_direct_mode) { >> + if (hv_allocate_stimer0_irq(&stimer0_irq, &stimer0_vector)) >> + goto err; >> + irq_set_handler(stimer0_irq, hv_stimer0_isr); >> + } >> + >> return 0; >> err: >> return -ENOMEM; >> @@ -292,6 +356,9 @@ void hv_synic_clockevents_cleanup(void) >> if (!(ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE)) >> return; >> >> + if (stimer_direct_mode) >> + hv_deallocate_stimer0_irq(stimer0_irq); >> + >> for_each_present_cpu(cpu) { >> struct hv_per_cpu_context *hv_cpu >> = per_cpu_ptr(hv_context.cpu_context, cpu); >> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h >> index de6f01d..ee8c89b 100644 >> --- a/drivers/hv/hyperv_vmbus.h >> +++ b/drivers/hv/hyperv_vmbus.h >> @@ -55,7 +55,9 @@ >> u64 periodic:1; >> u64 lazy:1; >> u64 auto_enable:1; >> - u64 reserved_z0:12; >> + u64 apic_vector:8; >> + u64 direct_mode:1; >> + u64 reserved_z0:3; >> u64 sintx:4; >> u64 reserved_z1:44; >> }; -- Vitaly _______________________________________________ devel mailing list devel@xxxxxxxxxxxxxxxxxxxxxx http://driverdev.linuxdriverproject.org/mailman/listinfo/driverdev-devel