Linux has support for free page reporting now (36e66c554b5c) for virtualized environment. On Hyper-V when virtually backed VMs are configured, Hyper-V will advertise cold memory discard capability, when supported. This patch adds the support to hook into the free page reporting infrastructure and leverage the Hyper-V cold memory discard hint hypercall to report/free these pages back to the host. Signed-off-by: Sunil Muthuswamy <sunilmut@xxxxxxxxxxxxx> Tested-by: Matheus Castello <matheus@xxxxxxxxxxxxxxx> --- In V2: - Addressed feedback comments - Added page reporting config option tied to hyper-v balloon config In V3: - Addressed feedback from Vitaly In V4: - Queried and cached the Hyper-V extended capability for the lifetime of the VM - Addressed feedback from Michael Kelley. In v5: - Added a comment clarifying handling of failed query extended capability hypercall to address Michael's feedback. --- arch/x86/hyperv/hv_init.c | 51 +++++++++++++++++- arch/x86/kernel/cpu/mshyperv.c | 9 ++-- drivers/hv/Kconfig | 1 + drivers/hv/hv_balloon.c | 89 +++++++++++++++++++++++++++++++ include/asm-generic/hyperv-tlfs.h | 35 +++++++++++- include/asm-generic/mshyperv.h | 3 +- 6 files changed, 180 insertions(+), 8 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 9d100257b3af..7c9da3f65afa 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -498,6 +498,8 @@ void __init hyperv_init(void) x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain; #endif + /* Query the VMs extended capability once, so that it can be cached. */ + hv_query_ext_cap(0); return; remove_cpuhp_state: @@ -601,7 +603,7 @@ EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); enum hv_isolation_type hv_get_isolation_type(void) { - if (!(ms_hyperv.features_b & HV_ISOLATION)) + if (!(ms_hyperv.priv_high & HV_ISOLATION)) return HV_ISOLATION_TYPE_NONE; return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b); } @@ -612,3 +614,50 @@ bool hv_is_isolation_supported(void) return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; } EXPORT_SYMBOL_GPL(hv_is_isolation_supported); + +/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */ +bool hv_query_ext_cap(u64 cap_query) +{ + /* + * The address of the 'hv_extended_cap' variable will be used as an + * output parameter to the hypercall below and so it should be + * compatible with 'virt_to_phys'. Which means, it's address should be + * directly mapped. Use 'static' to keep it compatible; stack variables + * can be virtually mapped, making them imcompatible with + * 'virt_to_phys'. + * Hypercall input/output addresses should also be 8-byte aligned. + */ + static u64 hv_extended_cap __aligned(8); + static bool hv_extended_cap_queried; + u64 status; + + /* + * Querying extended capabilities is an extended hypercall. Check if the + * partition supports extended hypercall, first. + */ + if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS)) + return false; + + /* Extended capabilities do not change at runtime. */ + if (hv_extended_cap_queried) + return hv_extended_cap & cap_query; + + status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, + &hv_extended_cap); + + /* + * The query extended capabilities hypercall should not fail under + * any normal circumstances. Avoid repeatedly making the hypercall, on + * error. + */ + hv_extended_cap_queried = true; + status &= HV_HYPERCALL_RESULT_MASK; + if (status != HV_STATUS_SUCCESS) { + pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n", + status); + return false; + } + + return hv_extended_cap & cap_query; +} +EXPORT_SYMBOL_GPL(hv_query_ext_cap); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index cebed535ec56..3546d3e21787 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -265,12 +265,13 @@ static void __init ms_hyperv_init_platform(void) * Extract the features and hints */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); - ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES); + ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES); ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n", - ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features); + pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n", + ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, + ms_hyperv.misc_features); ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS); @@ -316,7 +317,7 @@ static void __init ms_hyperv_init_platform(void) x86_platform.calibrate_cpu = hv_get_tsc_khz; } - if (ms_hyperv.features_b & HV_ISOLATION) { + if (ms_hyperv.priv_high & HV_ISOLATION) { ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG); ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG); diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 79e5356a737a..66c794d92391 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -23,6 +23,7 @@ config HYPERV_UTILS config HYPERV_BALLOON tristate "Microsoft Hyper-V Balloon driver" depends on HYPERV + select PAGE_REPORTING help Select this option to enable Hyper-V Balloon driver. diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 2f776d78e3c1..58af84e30144 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -21,6 +21,7 @@ #include <linux/memory.h> #include <linux/notifier.h> #include <linux/percpu_counter.h> +#include <linux/page_reporting.h> #include <linux/hyperv.h> #include <asm/hyperv-tlfs.h> @@ -563,6 +564,8 @@ struct hv_dynmem_device { * The negotiated version agreed by host. */ __u32 version; + + struct page_reporting_dev_info pr_dev_info; }; static struct hv_dynmem_device dm_device; @@ -1568,6 +1571,89 @@ static void balloon_onchannelcallback(void *context) } +/* Hyper-V only supports reporting 2MB pages or higher */ +#define HV_MIN_PAGE_REPORTING_ORDER 9 +#define HV_MIN_PAGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << HV_MIN_PAGE_REPORTING_ORDER) +static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, + struct scatterlist *sgl, unsigned int nents) +{ + unsigned long flags; + struct hv_memory_hint *hint; + int i; + u64 status; + struct scatterlist *sg; + + WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES); + WARN_ON_ONCE(sgl->length < HV_MIN_PAGE_REPORTING_LEN); + local_irq_save(flags); + hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg); + if (!hint) { + local_irq_restore(flags); + return -ENOSPC; + } + + hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD; + hint->reserved = 0; + for_each_sg(sgl, sg, nents, i) { + union hv_gpa_page_range *range; + + range = &hint->ranges[i]; + range->address_space = 0; + /* page reporting only reports 2MB pages or higher */ + range->page.largepage = 1; + range->page.additional_pages = + (sg->length / HV_MIN_PAGE_REPORTING_LEN) - 1; + range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB; + range->base_large_pfn = + page_to_hvpfn(sg_page(sg)) >> HV_MIN_PAGE_REPORTING_ORDER; + } + + status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0, + hint, NULL); + local_irq_restore(flags); + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) { + pr_err("Cold memory discard hypercall failed with status %llx\n", + status); + return -EINVAL; + } + + return 0; +} + +static void enable_page_reporting(void) +{ + int ret; + + /* Essentially, validating 'PAGE_REPORTING_MIN_ORDER' is big enough. */ + if (pageblock_order < HV_MIN_PAGE_REPORTING_ORDER) { + pr_debug("Cold memory discard is only supported on 2MB pages and above\n"); + return; + } + + if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) { + pr_debug("Cold memory discard hint not supported by Hyper-V\n"); + return; + } + + BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES); + dm_device.pr_dev_info.report = hv_free_page_report; + ret = page_reporting_register(&dm_device.pr_dev_info); + if (ret < 0) { + dm_device.pr_dev_info.report = NULL; + pr_err("Failed to enable cold memory discard: %d\n", ret); + } else { + pr_info("Cold memory discard hint enabled\n"); + } +} + +static void disable_page_reporting(void) +{ + if (dm_device.pr_dev_info.report) { + page_reporting_unregister(&dm_device.pr_dev_info); + dm_device.pr_dev_info.report = NULL; + } +} + static int balloon_connect_vsp(struct hv_device *dev) { struct dm_version_request version_req; @@ -1713,6 +1799,7 @@ static int balloon_probe(struct hv_device *dev, if (ret != 0) return ret; + enable_page_reporting(); dm_device.state = DM_INITIALIZED; dm_device.thread = @@ -1727,6 +1814,7 @@ static int balloon_probe(struct hv_device *dev, probe_error: dm_device.state = DM_INIT_ERROR; dm_device.thread = NULL; + disable_page_reporting(); vmbus_close(dev->channel); #ifdef CONFIG_MEMORY_HOTPLUG unregister_memory_notifier(&hv_memory_nb); @@ -1749,6 +1837,7 @@ static int balloon_remove(struct hv_device *dev) cancel_work_sync(&dm->ha_wrk.wrk); kthread_stop(dm->thread); + disable_page_reporting(); vmbus_close(dev->channel); #ifdef CONFIG_MEMORY_HOTPLUG unregister_memory_notifier(&hv_memory_nb); diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index 9cf10837d005..515c3fb06ab3 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -89,9 +89,9 @@ #define HV_ACCESS_STATS BIT(8) #define HV_DEBUGGING BIT(11) #define HV_CPU_MANAGEMENT BIT(12) +#define HV_ENABLE_EXTENDED_HYPERCALLS BIT(20) #define HV_ISOLATION BIT(22) - /* * TSC page layout. */ @@ -159,11 +159,18 @@ struct ms_hyperv_tsc_page { #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 +/* Extended hypercalls */ +#define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001 +#define HV_EXT_CALL_MEMORY_HEAT_HINT 0x8003 + #define HV_FLUSH_ALL_PROCESSORS BIT(0) #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) #define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) +/* Extended capability bits */ +#define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8) + enum HV_GENERIC_SET_FORMAT { HV_GENERIC_SET_SPARSE_4K, HV_GENERIC_SET_ALL, @@ -408,8 +415,10 @@ struct hv_guest_mapping_flush { * by the bitwidth of "additional_pages" in union hv_gpa_page_range. */ #define HV_MAX_FLUSH_PAGES (2048) +#define HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB 0 +#define HV_GPA_PAGE_RANGE_PAGE_SIZE_1GB 1 -/* HvFlushGuestPhysicalAddressList hypercall */ +/* HvFlushGuestPhysicalAddressList, HvExtCallMemoryHeatHint hypercall */ union hv_gpa_page_range { u64 address_space; struct { @@ -417,6 +426,12 @@ union hv_gpa_page_range { u64 largepage:1; u64 basepfn:52; } page; + struct { + u64 reserved:12; + u64 page_size:1; + u64 reserved1:8; + u64 base_large_pfn:43; + }; }; /* @@ -774,4 +789,20 @@ struct hv_input_unmap_device_interrupt { #define HV_SOURCE_SHADOW_NONE 0x0 #define HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE 0x1 +/* + * The whole argument should fit in a page to be able to pass to the hypervisor + * in one hypercall. + */ +#define HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_memory_hint)) / \ + sizeof(union hv_gpa_page_range)) + +/* HvExtCallMemoryHeatHint hypercall */ +#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD 2 +struct hv_memory_hint { + u64 type:2; + u64 reserved:62; + union hv_gpa_page_range ranges[]; +} __packed; + #endif diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 2d1b6cd9f000..63c0e579bf6d 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -27,7 +27,7 @@ struct ms_hyperv_info { u32 features; - u32 features_b; + u32 priv_high; u32 misc_features; u32 hints; u32 nested_features; @@ -179,6 +179,7 @@ bool hv_is_hibernation_supported(void); enum hv_isolation_type hv_get_isolation_type(void); bool hv_is_isolation_supported(void); void hyperv_cleanup(void); +bool hv_query_ext_cap(u64 cap_query); #else /* CONFIG_HYPERV */ static inline bool hv_is_hyperv_initialized(void) { return false; } static inline bool hv_is_hibernation_supported(void) { return false; } -- 2.17.1