lantianyu1986@xxxxxxxxx writes: > From: Tianyu Lan <Tianyu.Lan@xxxxxxxxxxxxx> > > Hyper-V provides dynamic memory hot add/remove function. > Memory hot-add has already enabled in Hyper-V balloon driver. > Now add memory hot-remove function. > > When driver receives hot-remove msg, it first checks whether > request remove page number is aligned with hot plug unit(128MB). > If there are remainder pages(pages%128MB), handle remainder pages > via balloon way(allocate pages, offline pages and return back to > Hyper-V). > > To remove memory chunks, search memory in the hot add blocks first > and then other system memory. > > Hyper-V has a bug of sending unballoon msg to request memory > hot-add after doing memory hot-remove. Fix it to handle all > unballoon msg with memory hot-add operation. > > Signed-off-by: Tianyu Lan <Tianyu.Lan@xxxxxxxxxxxxx> > --- > drivers/hv/hv_balloon.c | 686 +++++++++++++++++++++++++++++++++++++++++++----- This patch is too big to review and the logic in it is not trivial at all. Please try to split this into a series so we can take a look. > 1 file changed, 616 insertions(+), 70 deletions(-) > > diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c > index 4d1a3b1e2490..015e9e993188 100644 > --- a/drivers/hv/hv_balloon.c > +++ b/drivers/hv/hv_balloon.c > @@ -19,6 +19,7 @@ > #include <linux/completion.h> > #include <linux/memory_hotplug.h> > #include <linux/memory.h> > +#include <linux/memblock.h> > #include <linux/notifier.h> > #include <linux/percpu_counter.h> > > @@ -46,12 +47,17 @@ > * Changes to 0.2 on 2009/05/14 > * Changes to 0.3 on 2009/12/03 > * Changed to 1.0 on 2011/04/05 > + * Changed to 2.0 on 2019/12/10 > */ > > #define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor))) > #define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16) > #define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff) > > +#define MAX_HOT_REMOVE_ENTRIES \ > + ((PAGE_SIZE - sizeof(struct dm_hot_remove_response)) \ > + / sizeof(union dm_mem_page_range)) > + > enum { > DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3), > DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0), > @@ -91,7 +97,13 @@ enum dm_message_type { > * Version 1.0. > */ > DM_INFO_MESSAGE = 12, > - DM_VERSION_1_MAX = 12 > + DM_VERSION_1_MAX = 12, > + > + /* > + * Version 2.0 > + */ > + DM_MEM_HOT_REMOVE_REQUEST = 13, > + DM_MEM_HOT_REMOVE_RESPONSE = 14 > }; > > > @@ -120,7 +132,8 @@ union dm_caps { > * represents an alignment of 2^n in mega bytes. > */ > __u64 hot_add_alignment:4; > - __u64 reservedz:58; > + __u64 hot_remove:1; > + __u64 reservedz:57; > } cap_bits; > __u64 caps; > } __packed; > @@ -231,7 +244,9 @@ struct dm_capabilities { > struct dm_capabilities_resp_msg { > struct dm_header hdr; > __u64 is_accepted:1; > - __u64 reservedz:63; > + __u64 hot_remove:1; > + __u64 suppress_pressure_reports:1; > + __u64 reservedz:61; > } __packed; > > /* > @@ -376,6 +391,27 @@ struct dm_hot_add_response { > __u32 result; > } __packed; > > +struct dm_hot_remove { > + struct dm_header hdr; > + __u32 virtual_node; > + __u32 page_count; > + __u32 qos_flags; > + __u32 reservedZ; > +} __packed; > + > +struct dm_hot_remove_response { > + struct dm_header hdr; > + __u32 result; > + __u32 range_count; > + __u64 more_pages:1; > + __u64 reservedz:63; > + union dm_mem_page_range range_array[]; > +} __packed; > + > +#define DM_REMOVE_QOS_LARGE (1 << 0) > +#define DM_REMOVE_QOS_LOCAL (1 << 1) > +#define DM_REMOVE_QoS_MASK (0x3) Capitalize 'QoS' to make it match previous two lines please. > + > /* > * Types of information sent from host to the guest. > */ > @@ -457,6 +493,13 @@ struct hot_add_wrk { > struct work_struct wrk; > }; > > +struct hot_remove_wrk { > + __u32 virtual_node; > + __u32 page_count; > + __u32 qos_flags; > + struct work_struct wrk; > +}; > + > static bool hot_add = true; > static bool do_hot_add; > /* > @@ -489,6 +532,7 @@ enum hv_dm_state { > DM_BALLOON_UP, > DM_BALLOON_DOWN, > DM_HOT_ADD, > + DM_HOT_REMOVE, > DM_INIT_ERROR > }; > > @@ -515,11 +559,13 @@ struct hv_dynmem_device { > * State to manage the ballooning (up) operation. > */ > struct balloon_state balloon_wrk; > + struct balloon_state unballoon_wrk; > > /* > * State to execute the "hot-add" operation. This comment is stale now. > */ > struct hot_add_wrk ha_wrk; > + struct hot_remove_wrk hr_wrk; Do we actually want to work struct and all the problems with their serialization? Can we get away with one? > > /* > * This state tracks if the host has specified a hot-add > @@ -569,6 +615,42 @@ static struct hv_dynmem_device dm_device; > > static void post_status(struct hv_dynmem_device *dm); > > +static int hv_send_hot_remove_response( > + struct dm_hot_remove_response *resp, > + long array_index, bool more_pages) > +{ > + struct hv_dynmem_device *dm = &dm_device; > + int ret; > + > + resp->hdr.type = DM_MEM_HOT_REMOVE_RESPONSE; > + resp->range_count = array_index; > + resp->more_pages = more_pages; > + resp->hdr.size = sizeof(struct dm_hot_remove_response) > + + sizeof(union dm_mem_page_range) * array_index; > + > + if (array_index) > + resp->result = 0; > + else > + resp->result = 1; > + > + do { > + resp->hdr.trans_id = atomic_inc_return(&trans_id); > + ret = vmbus_sendpacket(dm->dev->channel, resp, > + resp->hdr.size, > + (unsigned long)NULL, > + VM_PKT_DATA_INBAND, 0); > + > + if (ret == -EAGAIN) > + msleep(20); > + post_status(&dm_device); > + } while (ret == -EAGAIN); > + > + if (ret) > + pr_err("Fail to send hot-remove response msg.\n"); > + > + return ret; > +} > + > #ifdef CONFIG_MEMORY_HOTPLUG > static inline bool has_pfn_is_backed(struct hv_hotadd_state *has, > unsigned long pfn) > @@ -628,7 +710,9 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, > void *v) > { > struct memory_notify *mem = (struct memory_notify *)v; > - unsigned long flags, pfn_count; > + unsigned long pfn_count; > + unsigned long flags = 0; > + int unlocked; > > switch (val) { > case MEM_ONLINE: > @@ -640,7 +724,11 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, > break; > > case MEM_OFFLINE: > - spin_lock_irqsave(&dm_device.ha_lock, flags); > + if (dm_device.lock_thread != current) { > + spin_lock_irqsave(&dm_device.ha_lock, flags); > + unlocked = 1; > + } > + > pfn_count = hv_page_offline_check(mem->start_pfn, > mem->nr_pages); > if (pfn_count <= dm_device.num_pages_onlined) { > @@ -654,7 +742,10 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, > WARN_ON_ONCE(1); > dm_device.num_pages_onlined = 0; > } > - spin_unlock_irqrestore(&dm_device.ha_lock, flags); > + > + if (unlocked) > + spin_unlock_irqrestore(&dm_device.ha_lock, flags); > + > break; > case MEM_GOING_ONLINE: > case MEM_GOING_OFFLINE: > @@ -727,9 +818,17 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, > init_completion(&dm_device.ol_waitevent); > dm_device.ha_waiting = !memhp_auto_online; > > - nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); > - ret = add_memory(nid, PFN_PHYS((start_pfn)), > - (HA_CHUNK << PAGE_SHIFT)); > + /* > + * If memory section of hot add region is online, > + * just bring pages online in the region. > + */ > + if (online_section_nr(pfn_to_section_nr(start_pfn))) { > + hv_bring_pgs_online(has, start_pfn, processed_pfn); > + } else { > + nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); > + ret = add_memory(nid, PFN_PHYS((start_pfn)), > + (HA_CHUNK << PAGE_SHIFT)); > + } > > if (ret) { > pr_err("hot_add memory failed error is %d\n", ret); > @@ -765,8 +864,8 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, > static void hv_online_page(struct page *pg, unsigned int order) > { > struct hv_hotadd_state *has; > - unsigned long flags; > unsigned long pfn = page_to_pfn(pg); > + unsigned long flags = 0; Why is this change needed? > int unlocked; > > if (dm_device.lock_thread != current) { > @@ -806,10 +905,12 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) > continue; > > /* > - * If the current start pfn is not where the covered_end > - * is, create a gap and update covered_end_pfn. > + * If the current start pfn is great than covered_end_pfn, > + * create a gap and update covered_end_pfn. Start pfn may > + * locate at gap which is created during hot remove. The > + * gap range is less than covered_end_pfn. > */ > - if (has->covered_end_pfn != start_pfn) { > + if (has->covered_end_pfn < start_pfn) { > gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC); > if (!gap) { > ret = -ENOMEM; > @@ -848,6 +949,91 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) > return ret; > } > > +static int handle_hot_add_in_gap(unsigned long start, unsigned long pg_cnt, > + struct hv_hotadd_state *has) > +{ > + struct hv_hotadd_gap *gap, *new_gap, *tmp_gap; > + unsigned long pfn_cnt = pg_cnt; > + unsigned long start_pfn = start; > + unsigned long end_pfn; > + unsigned long pages; > + unsigned long pgs_ol; > + unsigned long block_pages = HA_CHUNK; > + unsigned long pfn; > + int nid; > + int ret; > + > + list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) { > + > + if ((start_pfn < gap->start_pfn) > + || (start_pfn >= gap->end_pfn)) > + continue; > + > + end_pfn = min(gap->end_pfn, start_pfn + pfn_cnt); > + pgs_ol = end_pfn - start_pfn; > + > + /* > + * hv_bring_pgs_online() identifies whether pfn > + * should be online or not via checking pfn is in > + * hot add covered range or gap range(Detail see > + * has_pfn_is_backed()). So adjust gap before bringing > + * online or add memory. > + */ > + if (gap->end_pfn - gap->start_pfn == pgs_ol) { > + list_del(&gap->list); > + kfree(gap); > + } else if (gap->start_pfn < start && gap->end_pfn == end_pfn) { > + gap->end_pfn = start_pfn; > + } else if (gap->end_pfn > end_pfn > + && gap->start_pfn == start_pfn) { > + gap->start_pfn = end_pfn; > + } else { > + gap->end_pfn = start_pfn; > + > + new_gap = kzalloc(sizeof(struct hv_hotadd_gap), > + GFP_ATOMIC); > + if (!new_gap) { > + do_hot_add = false; > + return -ENOMEM; > + } > + > + INIT_LIST_HEAD(&new_gap->list); > + new_gap->start_pfn = end_pfn; > + new_gap->end_pfn = gap->end_pfn; > + list_add_tail(&gap->list, &has->gap_list); > + } > + > + /* Bring online or add memmory in gaps. */ > + for (pfn = start_pfn; pfn < end_pfn; > + pfn = round_up(pfn + 1, block_pages)) { > + pages = min(round_up(pfn + 1, block_pages), > + end_pfn) - pfn; > + > + if (online_section_nr(pfn_to_section_nr(pfn))) { > + hv_bring_pgs_online(has, pfn, pages); > + } else { > + nid = memory_add_physaddr_to_nid(PFN_PHYS(pfn)); > + ret = add_memory(nid, PFN_PHYS(pfn), > + round_up(pages, block_pages) > + << PAGE_SHIFT); > + if (ret) { > + pr_err("Fail to add memory in gaps(error=%d).\n", > + ret); > + do_hot_add = false; > + return ret; > + } > + } > + } > + > + start_pfn += pgs_ol; > + pfn_cnt -= pgs_ol; > + if (!pfn_cnt) > + break; > + } > + > + return pg_cnt - pfn_cnt; > +} > + > static unsigned long handle_pg_range(unsigned long pg_start, > unsigned long pg_count) > { > @@ -874,6 +1060,22 @@ static unsigned long handle_pg_range(unsigned long pg_start, > > old_covered_state = has->covered_end_pfn; > > + /* > + * If start_pfn is less than cover_end_pfn, the hot-add memory > + * area is in the gap range. > + */ > + if (start_pfn < has->covered_end_pfn) { > + pgs_ol = handle_hot_add_in_gap(start_pfn, pfn_cnt, has); > + > + pfn_cnt -= pgs_ol; > + if (!pfn_cnt) { > + res = pgs_ol; > + break; > + } > + > + start_pfn += pgs_ol; > + } > + > if (start_pfn < has->ha_end_pfn) { > /* > * This is the case where we are backing pages > @@ -931,6 +1133,23 @@ static unsigned long handle_pg_range(unsigned long pg_start, > return res; > } > > +static void free_allocated_pages(__u64 start_frame, int num_pages) > +{ > + struct page *pg; > + int i; > + > + for (i = 0; i < num_pages; i++) { > + pg = pfn_to_page(i + start_frame); > + > + if (page_private(pfn_to_page(i))) > + set_page_private(pfn_to_page(i), 0); > + > + __ClearPageOffline(pg); > + __free_page(pg); > + dm_device.num_pages_ballooned--; > + } > +} > + > static unsigned long process_hot_add(unsigned long pg_start, > unsigned long pfn_cnt, > unsigned long rg_start, > @@ -940,18 +1159,40 @@ static unsigned long process_hot_add(unsigned long pg_start, > int covered; > unsigned long flags; > > - if (pfn_cnt == 0) > - return 0; > + /* > + * Check whether page is allocated by driver via page private > + * data due to remainder pages. > + */ > + if (present_section_nr(pfn_to_section_nr(pg_start)) > + && page_private(pfn_to_page(pg_start))) { > + free_allocated_pages(pg_start, pfn_cnt); > + return pfn_cnt; > + } > > - if (!dm_device.host_specified_ha_region) { > - covered = pfn_covered(pg_start, pfn_cnt); > - if (covered < 0) > - return 0; > + if ((rg_start == 0) && (!dm_device.host_specified_ha_region)) { > + /* > + * The host has not specified the hot-add region. > + * Based on the hot-add page range being specified, > + * compute a hot-add region that can cover the pages > + * that need to be hot-added while ensuring the alignment > + * and size requirements of Linux as it relates to hot-add. > + */ > + rg_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK; > + if (pfn_cnt % HA_CHUNK) > + rg_size += HA_CHUNK; > > - if (covered) > - goto do_pg_range; > + rg_start = (pg_start / HA_CHUNK) * HA_CHUNK; > } > > + if (pfn_cnt == 0) > + return 0; > + > + covered = pfn_covered(pg_start, pfn_cnt); > + if (covered < 0) > + return 0; > + else if (covered) > + goto do_pg_range; > + > /* > * If the host has specified a hot-add range; deal with it first. > */ > @@ -983,8 +1224,321 @@ static unsigned long process_hot_add(unsigned long pg_start, > return handle_pg_range(pg_start, pfn_cnt); > } > > +static int check_memblock_online(struct memory_block *mem, void *arg) > +{ > + if (mem->state != MEM_ONLINE) > + return -1; > + > + return 0; > +} > + > +static int change_memblock_state(struct memory_block *mem, void *arg) > +{ > + unsigned long state = (unsigned long)arg; > + > + mem->state = state; > + > + return 0; > +} > + > +static bool hv_offline_pages(unsigned long start_pfn, unsigned long nr_pages) > +{ > + const unsigned long start = PFN_PHYS(start_pfn); > + const unsigned long size = PFN_PHYS(nr_pages); > + > + lock_device_hotplug(); > + > + if (walk_memory_blocks(start, size, NULL, check_memblock_online)) { > + unlock_device_hotplug(); > + return false; > + } > + > + walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE, > + change_memblock_state); > + > + if (offline_pages(start_pfn, nr_pages)) { > + walk_memory_blocks(start_pfn, nr_pages, (void *)MEM_ONLINE, > + change_memblock_state); > + unlock_device_hotplug(); > + return false; > + } > + > + walk_memory_blocks(start, size, (void *)MEM_OFFLINE, > + change_memblock_state); > + > + unlock_device_hotplug(); > + return true; > +} > + > +static int hv_hot_remove_range(unsigned int nid, unsigned long start_pfn, > + unsigned long end_pfn, unsigned long nr_pages, > + unsigned long *array_index, > + union dm_mem_page_range *range_array, > + struct hv_hotadd_state *has) > +{ > + unsigned long block_pages = HA_CHUNK; > + unsigned long rm_pages = nr_pages; > + unsigned long pfn; > + > + for (pfn = start_pfn; pfn < end_pfn; pfn += block_pages) { > + struct hv_hotadd_gap *gap; > + int in_gaps = 0; > + > + if (*array_index >= MAX_HOT_REMOVE_ENTRIES) { > + struct dm_hot_remove_response *resp = > + (struct dm_hot_remove_response *) > + balloon_up_send_buffer; > + int ret; > + > + /* Flush out all remove response entries. */ > + ret = hv_send_hot_remove_response(resp, *array_index, > + true); > + if (ret) > + return ret; > + > + memset(resp, 0x00, PAGE_SIZE); > + *array_index = 0; > + } > + > + if (has) { > + /* > + * Memory in gaps has been offlined or removed and > + * so skip it if remove range overlap with gap. > + */ > + list_for_each_entry(gap, &has->gap_list, list) > + if (!(pfn >= gap->end_pfn || > + pfn + block_pages < gap->start_pfn)) { > + in_gaps = 1; > + break; > + } > + > + if (in_gaps) > + continue; > + } > + > + if (online_section_nr(pfn_to_section_nr(pfn)) > + && is_mem_section_removable(pfn, block_pages) > + && hv_offline_pages(pfn, block_pages)) { > + remove_memory(nid, pfn << PAGE_SHIFT, > + block_pages << PAGE_SHIFT); > + > + range_array[*array_index].finfo.start_page = pfn; > + range_array[*array_index].finfo.page_cnt = block_pages; > + > + (*array_index)++; > + nr_pages -= block_pages; > + > + if (!nr_pages) > + break; > + } > + } > + > + return rm_pages - nr_pages; > +} > + > +static int hv_hot_remove_from_ha_list(unsigned int nid, unsigned long nr_pages, > + unsigned long *array_index, > + union dm_mem_page_range *range_array) > +{ > + struct hv_hotadd_state *has; > + unsigned long start_pfn, end_pfn; > + unsigned long flags, rm_pages; > + int old_index; > + int ret, i; > + > + spin_lock_irqsave(&dm_device.ha_lock, flags); > + dm_device.lock_thread = current; > + list_for_each_entry(has, &dm_device.ha_region_list, list) { > + start_pfn = has->start_pfn; > + end_pfn = has->covered_end_pfn; > + rm_pages = min(nr_pages, has->covered_end_pfn - has->start_pfn); > + old_index = *array_index; > + > + if (!rm_pages || pfn_to_nid(start_pfn) != nid) > + continue; > + > + rm_pages = hv_hot_remove_range(nid, start_pfn, end_pfn, > + rm_pages, array_index, range_array, has); > + > + if (rm_pages < 0) > + return rm_pages; > + else if (!rm_pages) > + continue; > + > + nr_pages -= rm_pages; > + dm_device.num_pages_added -= rm_pages; > + > + /* Create gaps for hot remove regions. */ > + for (i = old_index; i < *array_index; i++) { > + struct hv_hotadd_gap *gap; > + > + gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC); > + if (!gap) { > + ret = -ENOMEM; > + do_hot_add = false; > + return ret; > + } > + > + INIT_LIST_HEAD(&gap->list); > + gap->start_pfn = range_array[i].finfo.start_page; > + gap->end_pfn = > + gap->start_pfn + range_array[i].finfo.page_cnt; > + list_add_tail(&gap->list, &has->gap_list); > + } > + > + if (!nr_pages) > + break; > + } > + dm_device.lock_thread = NULL; > + spin_unlock_irqrestore(&dm_device.ha_lock, flags); > + > + return nr_pages; > +} > + > +static void free_balloon_pages(struct hv_dynmem_device *dm, > + union dm_mem_page_range *range_array) > +{ > + int num_pages = range_array->finfo.page_cnt; > + __u64 start_frame = range_array->finfo.start_page; > + > + free_allocated_pages(start_frame, num_pages); > +} > + > +static int hv_hot_remove_pages(struct dm_hot_remove_response *resp, > + u64 nr_pages, unsigned long *array_index, > + bool more_pages) > +{ > + int i, j, alloc_unit = PAGES_IN_2M; > + struct page *pg; > + int ret; > + > + for (i = 0; i < nr_pages; i += alloc_unit) { > + if (*array_index >= MAX_HOT_REMOVE_ENTRIES) { > + /* Flush out all remove response entries. */ > + ret = hv_send_hot_remove_response(resp, > + *array_index, true); > + if (ret) > + goto free_pages; > + > + /* > + * Continue to allocate memory for hot remove > + * after resetting send buffer and array index. > + */ > + memset(resp, 0x00, PAGE_SIZE); > + *array_index = 0; > + } > +retry: > + pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY | > + __GFP_NOMEMALLOC | __GFP_NOWARN, > + get_order(alloc_unit << PAGE_SHIFT)); > + if (!pg) { > + if (alloc_unit == 1) { > + ret = -ENOMEM; > + goto free_pages; > + } > + > + alloc_unit = 1; > + goto retry; > + } > + > + if (alloc_unit != 1) > + split_page(pg, get_order(alloc_unit << PAGE_SHIFT)); > + > + for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); > + j++) { > + __SetPageOffline(pg + j); > + > + /* > + * Set page's private data to non-zero and use it > + * to identify whehter the page is allocated by driver > + * or new hot-add memory in process_hot_add(). > + */ > + set_page_private(pg + j, 1); > + } > + > + resp->range_array[*array_index].finfo.start_page > + = page_to_pfn(pg); > + resp->range_array[*array_index].finfo.page_cnt > + = alloc_unit; > + (*array_index)++; > + > + dm_device.num_pages_ballooned += alloc_unit; > + } > + > + ret = hv_send_hot_remove_response(resp, *array_index, more_pages); > + if (ret) > + goto free_pages; > + > + return 0; > + > +free_pages: > + for (i = 0; i < *array_index; i++) > + free_balloon_pages(&dm_device, &resp->range_array[i]); > + > + /* Response hot remove failure. */ > + hv_send_hot_remove_response(resp, 0, false); > + return ret; > +} > + > +static void hv_hot_remove_mem_from_node(unsigned int nid, u64 nr_pages) > +{ > + struct dm_hot_remove_response *resp > + = (struct dm_hot_remove_response *)balloon_up_send_buffer; > + unsigned long remainder = nr_pages % HA_CHUNK; > + unsigned long start_pfn = node_start_pfn(nid); > + unsigned long end_pfn = node_end_pfn(nid); > + unsigned long array_index = 0; > + int ret; > + > + /* > + * If page number isn't aligned with memory hot plug unit, > + * handle remainder pages via balloon way. > + */ > + if (remainder) { > + memset(resp, 0x00, PAGE_SIZE); > + ret = hv_hot_remove_pages(resp, remainder, &array_index, > + !!(nr_pages - remainder)); > + if (ret) > + return; > + > + nr_pages -= remainder; > + if (!nr_pages) > + return; > + } > + > + memset(resp, 0x00, PAGE_SIZE); > + array_index = 0; > + nr_pages = hv_hot_remove_from_ha_list(nid, nr_pages, &array_index, > + resp->range_array); > + if (nr_pages < 0) { > + /* Set array_index to 0 and response failure in resposne msg. */ > + array_index = 0; > + } else if (nr_pages) { > + start_pfn = ALIGN(start_pfn, HA_CHUNK); > + hv_hot_remove_range(nid, start_pfn, end_pfn, nr_pages, > + &array_index, resp->range_array, NULL); > + } > + > + hv_send_hot_remove_response(resp, array_index, false); > +} > + > #endif > > +static void hot_remove_req(struct work_struct *dummy) > +{ > + struct hv_dynmem_device *dm = &dm_device; > + unsigned int numa_node = dm->hr_wrk.virtual_node; > + unsigned int page_count = dm->hr_wrk.page_count; > + > + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) || do_hot_add) > + hv_hot_remove_mem_from_node(numa_node, page_count); > + else > + hv_send_hot_remove_response((struct dm_hot_remove_response *) > + balloon_up_send_buffer, 0, false); > + > + dm->state = DM_INITIALIZED; > +} > + > static void hot_add_req(struct work_struct *dummy) > { > struct dm_hot_add_response resp; > @@ -1005,28 +1559,6 @@ static void hot_add_req(struct work_struct *dummy) > rg_start = dm->ha_wrk.ha_region_range.finfo.start_page; > rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt; > > - if ((rg_start == 0) && (!dm->host_specified_ha_region)) { > - unsigned long region_size; > - unsigned long region_start; > - > - /* > - * The host has not specified the hot-add region. > - * Based on the hot-add page range being specified, > - * compute a hot-add region that can cover the pages > - * that need to be hot-added while ensuring the alignment > - * and size requirements of Linux as it relates to hot-add. > - */ > - region_start = pg_start; > - region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK; > - if (pfn_cnt % HA_CHUNK) > - region_size += HA_CHUNK; > - > - region_start = (pg_start / HA_CHUNK) * HA_CHUNK; > - > - rg_start = region_start; > - rg_sz = region_size; > - } > - > if (do_hot_add) > resp.page_count = process_hot_add(pg_start, pfn_cnt, > rg_start, rg_sz); > @@ -1190,24 +1722,6 @@ static void post_status(struct hv_dynmem_device *dm) > > } > > -static void free_balloon_pages(struct hv_dynmem_device *dm, > - union dm_mem_page_range *range_array) > -{ > - int num_pages = range_array->finfo.page_cnt; > - __u64 start_frame = range_array->finfo.start_page; > - struct page *pg; > - int i; > - > - for (i = 0; i < num_pages; i++) { > - pg = pfn_to_page(i + start_frame); > - __ClearPageOffline(pg); > - __free_page(pg); > - dm->num_pages_ballooned--; > - } > -} > - > - > - > static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, > unsigned int num_pages, > struct dm_balloon_response *bl_resp, > @@ -1354,22 +1868,38 @@ static void balloon_up(struct work_struct *dummy) > > } > > -static void balloon_down(struct hv_dynmem_device *dm, > - struct dm_unballoon_request *req) > +static void balloon_down(struct work_struct *dummy) > { > + struct dm_unballoon_request *req = > + (struct dm_unballoon_request *)recv_buffer; > union dm_mem_page_range *range_array = req->range_array; > int range_count = req->range_count; > struct dm_unballoon_response resp; > - int i; > + struct hv_dynmem_device *dm = &dm_device; > unsigned int prev_pages_ballooned = dm->num_pages_ballooned; > + int i; > > for (i = 0; i < range_count; i++) { > - free_balloon_pages(dm, &range_array[i]); > - complete(&dm_device.config_event); > + /* > + * Hyper-V has a bug of sending unballoon msg instead > + * of hot add msg when there is no balloon msg sent before > + * Do hot add operation for all unballoon msg If hot add > + * capability is enabled, > + */ > + if (do_hot_add) { > + dm->host_specified_ha_region = false; > + dm->num_pages_added += > + process_hot_add(range_array[i].finfo.start_page, > + range_array[i].finfo.page_cnt, 0, 0); > + } else { > + free_balloon_pages(dm, &range_array[i]); > + } > } > + complete(&dm_device.config_event); > > - pr_debug("Freed %u ballooned pages.\n", > - prev_pages_ballooned - dm->num_pages_ballooned); > + if (!do_hot_add) > + pr_debug("Freed %u ballooned pages.\n", > + prev_pages_ballooned - dm->num_pages_ballooned); > > if (req->more_pages == 1) > return; > @@ -1489,6 +2019,7 @@ static void balloon_onchannelcallback(void *context) > struct hv_dynmem_device *dm = hv_get_drvdata(dev); > struct dm_balloon *bal_msg; > struct dm_hot_add *ha_msg; > + struct dm_hot_remove *hr_msg; > union dm_mem_page_range *ha_pg_range; > union dm_mem_page_range *ha_region; > > @@ -1522,8 +2053,7 @@ static void balloon_onchannelcallback(void *context) > > case DM_UNBALLOON_REQUEST: > dm->state = DM_BALLOON_DOWN; > - balloon_down(dm, > - (struct dm_unballoon_request *)recv_buffer); > + schedule_work(&dm_device.unballoon_wrk.wrk); > break; > > case DM_MEM_HOT_ADD_REQUEST: > @@ -1554,6 +2084,19 @@ static void balloon_onchannelcallback(void *context) > } > schedule_work(&dm_device.ha_wrk.wrk); > break; > + case DM_MEM_HOT_REMOVE_REQUEST: > + if (dm->state == DM_HOT_REMOVE) > + pr_warn("Currently hot-removing.\n"); > + > + dm->state = DM_HOT_REMOVE; > + hr_msg = (struct dm_hot_remove *)recv_buffer; > + > + dm->hr_wrk.virtual_node = hr_msg->virtual_node; > + dm->hr_wrk.page_count = hr_msg->page_count; > + dm->hr_wrk.qos_flags = hr_msg->qos_flags; > + > + schedule_work(&dm_device.hr_wrk.wrk); > + break; > > case DM_INFO_MESSAGE: > process_info(dm, (struct dm_info_msg *)dm_msg); > @@ -1628,6 +2171,7 @@ static int balloon_connect_vsp(struct hv_device *dev) > > cap_msg.caps.cap_bits.balloon = 1; > cap_msg.caps.cap_bits.hot_add = 1; > + cap_msg.caps.cap_bits.hot_remove = 1; > > /* > * Specify our alignment requirements as it relates > @@ -1688,7 +2232,9 @@ static int balloon_probe(struct hv_device *dev, > INIT_LIST_HEAD(&dm_device.ha_region_list); > spin_lock_init(&dm_device.ha_lock); > INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up); > + INIT_WORK(&dm_device.unballoon_wrk.wrk, balloon_down); > INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req); > + INIT_WORK(&dm_device.hr_wrk.wrk, hot_remove_req); > dm_device.host_specified_ha_region = false; > > #ifdef CONFIG_MEMORY_HOTPLUG -- Vitaly