From: Tianyu Lan <Tianyu.Lan@xxxxxxxxxxxxx>

The Linux memory hot-plug unit is 128MB, and the requested page count
may not be aligned with that unit. The non-aligned case will be
handled later.

Handle a mem hot-remove request as follows:

First, search for memory in the hot-add (ha) region list. If a
suitable memory block is found, offline and remove the memory and
create a "gap" struct for the range in the ha region. A "gap" means
the range inside a hot-add region has been offlined or removed. A
subsequent mem hot-add message may add memory back into the gap range.

If there is no suitable memory in the hot-add regions, search the
system memory on the target node and offline and remove memory from
there.

Signed-off-by: Tianyu Lan <Tianyu.Lan@xxxxxxxxxxxxx>
---
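Note (outside the commit message): below is a minimal stand-alone
sketch of the 128MB block-granularity math the request handling relies
on. It is only an illustration, assuming 4KB pages and the driver's
existing HA_CHUNK of 32 * 1024 pages (one memory block); the EX_*
names and the request size are made up for the example and are not
part of the patch.

/*
 * Stand-alone illustration only (not part of this patch): how a
 * hot-remove request expressed in pages maps onto 128MB memory
 * blocks. Assumes 4KB pages and a block size of 32 * 1024 pages,
 * matching HA_CHUNK in hv_balloon.c; the request size is arbitrary.
 */
#include <stdio.h>

#define EX_PAGE_SIZE	4096UL
#define EX_HA_CHUNK	(32UL * 1024)	/* pages per memory block: 128MB */

int main(void)
{
	unsigned long request_pages = 300000;	/* hypothetical request */
	unsigned long blocks = request_pages / EX_HA_CHUNK;
	unsigned long remainder = request_pages % EX_HA_CHUNK;

	printf("request: %lu pages (%lu MB)\n", request_pages,
	       request_pages * EX_PAGE_SIZE >> 20);
	printf("whole 128MB blocks that can be offlined/removed: %lu\n",
	       blocks);
	printf("non-aligned remainder (deferred for now): %lu pages\n",
	       remainder);
	return 0;
}

With the example request of 300000 pages, nine whole 128MB blocks can
be offlined and removed; the 5088-page remainder is the "non-aligned
case" the commit message defers.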
 drivers/hv/hv_balloon.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 184 insertions(+), 4 deletions(-)

diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 43e490f492d5..3d8c09fe148a 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -56,6 +56,10 @@
 #define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16)
 #define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff)
 
+#define MAX_HOT_REMOVE_ENTRIES					\
+	((PAGE_SIZE - sizeof(struct dm_hot_remove_response))	\
+	 / sizeof(union dm_mem_page_range))
+
 enum {
 	DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3),
 	DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0),
@@ -697,6 +701,7 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
 {
 	struct memory_notify *mem = (struct memory_notify *)v;
 	unsigned long pfn_count;
+	int need_unlock = 0;
 
 	switch (val) {
 	case MEM_ONLINE:
@@ -708,7 +713,11 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
 		break;
 
 	case MEM_OFFLINE:
-		mutex_lock(&dm_device.ha_lock);
+		if (dm_device.lock_thread != current) {
+			mutex_lock(&dm_device.ha_lock);
+			need_unlock = 1;
+		}
+
 		pfn_count = hv_page_offline_check(mem->start_pfn,
 						  mem->nr_pages);
 		if (pfn_count <= dm_device.num_pages_onlined) {
@@ -722,7 +731,9 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
 			WARN_ON_ONCE(1);
 			dm_device.num_pages_onlined = 0;
 		}
-		mutex_unlock(&dm_device.ha_lock);
+
+		if (need_unlock)
+			mutex_unlock(&dm_device.ha_lock);
 		break;
 	case MEM_GOING_ONLINE:
 	case MEM_GOING_OFFLINE:
@@ -1046,14 +1057,183 @@ static unsigned long process_hot_add(unsigned long pg_start,
 	return handle_pg_range(pg_start, pfn_cnt);
 }
 
+static int hv_hot_remove_range(unsigned int nid, unsigned long start_pfn,
+			       unsigned long end_pfn, unsigned long nr_pages,
+			       unsigned long *request_index,
+			       union dm_mem_page_range *range_array,
+			       struct hv_hotadd_state *has)
+{
+	unsigned long block_pages = HA_CHUNK;
+	unsigned long rm_pages = nr_pages;
+	unsigned long pfn;
+	int ret;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += block_pages) {
+		struct hv_hotadd_gap *gap;
+		int in_gap = 0;
+
+		if (*request_index >= MAX_HOT_REMOVE_ENTRIES) {
+			struct dm_hot_remove_response *resp =
+				(struct dm_hot_remove_response *)
+					balloon_up_send_buffer;
+
+			/* Flush out all hot-remove ranges. */
+			ret = hv_send_hot_remove_response(resp, *request_index,
+							  true);
+			if (ret)
+				return ret;
+
+			/* Reset request buffer. */
+			memset(resp, 0x00, PAGE_SIZE);
+			*request_index = 0;
+		}
+
+		/*
+		 * Memory in hot-add region gaps has already been offlined
+		 * or removed, so skip a block if its range overlaps a gap.
+		 */
+		if (has) {
+			list_for_each_entry(gap, &has->gap_list, list)
+				if (!(pfn >= gap->end_pfn ||
+				      pfn + block_pages < gap->start_pfn)) {
+					in_gap = 1;
+					break;
+				}
+
+			if (in_gap)
+				continue;
+		}
+
+		if (online_section_nr(pfn_to_section_nr(pfn))
+		    && is_mem_section_removable(pfn, block_pages)) {
+			ret = offline_and_remove_memory(nid, pfn << PAGE_SHIFT,
+						block_pages << PAGE_SHIFT);
+			if (ret)
+				continue;
+
+			range_array[*request_index].finfo.start_page = pfn;
+			range_array[*request_index].finfo.page_cnt
+					= block_pages;
+
+			(*request_index)++;
+			nr_pages -= block_pages;
+
+			if (!nr_pages)
+				break;
+		}
+	}
+
+	return rm_pages - nr_pages;
+}
+
+static int hv_hot_remove_from_ha_list(unsigned int nid, unsigned long nr_pages,
+				      unsigned long *request_index,
+				      union dm_mem_page_range *range_array)
+{
+	struct hv_hotadd_state *has;
+	unsigned long start_pfn, end_pfn;
+	int rm_pages;
+	int old_index;
+	int ret, i;
+
+	mutex_lock(&dm_device.ha_lock);
+	dm_device.lock_thread = current;
+	list_for_each_entry(has, &dm_device.ha_region_list, list) {
+		rm_pages = min(nr_pages,
+			       has->covered_end_pfn - has->start_pfn);
+		start_pfn = has->start_pfn;
+		end_pfn = has->covered_end_pfn;
+		old_index = *request_index;
+
+		if (!rm_pages || pfn_to_nid(start_pfn) != nid)
+			continue;
+
+		rm_pages = hv_hot_remove_range(nid, start_pfn, end_pfn,
+				rm_pages, request_index, range_array, has);
+		if (rm_pages < 0) {
+			ret = rm_pages;
+			goto error;
+		} else if (!rm_pages) {
+			continue;
+		}
+
+		nr_pages -= rm_pages;
+		dm_device.num_pages_added -= rm_pages;
+
+		/* Create gaps for hot-remove regions. */
+		for (i = old_index; i < *request_index; i++) {
+			struct hv_hotadd_gap *gap;
+
+			gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC);
+			if (!gap) {
+				/*
+				 * Disable dm hot-plug when allocating memory
+				 * for a gap fails.
+				 */
+				ret = -ENOMEM;
+				do_hot_add = false;
+				goto error;
+			}
+
+			INIT_LIST_HEAD(&gap->list);
+			gap->start_pfn = range_array[i].finfo.start_page;
+			gap->end_pfn =
+				gap->start_pfn + range_array[i].finfo.page_cnt;
+			list_add_tail(&gap->list, &has->gap_list);
+		}
+
+		if (!nr_pages)
+			break;
+	}
+
+	ret = nr_pages;
+error:
+	dm_device.lock_thread = NULL;
+	mutex_unlock(&dm_device.ha_lock);
+
+	return ret;
+}
+
+static void hv_mem_hot_remove(unsigned int nid, u64 nr_pages)
+{
+	struct dm_hot_remove_response *resp
+		= (struct dm_hot_remove_response *)balloon_up_send_buffer;
+	unsigned long start_pfn = node_start_pfn(nid);
+	unsigned long end_pfn = node_end_pfn(nid);
+	unsigned long request_index = 0;
+	int remain_pages;
+
+	/* Todo: Handle requests with a non-aligned page count later. */
+
+	/* Search for hot-remove memory in the hot-add region list first. */
+	memset(resp, 0x00, PAGE_SIZE);
+	remain_pages = hv_hot_remove_from_ha_list(nid, nr_pages,
+						  &request_index,
+						  resp->range_array);
+	if (remain_pages < 0) {
+		/* Send failure response msg. */
+		request_index = 0;
+	} else if (remain_pages) {
+		start_pfn = ALIGN(start_pfn, HA_CHUNK);
+		hv_hot_remove_range(nid, start_pfn, end_pfn, remain_pages,
+				&request_index, resp->range_array, NULL);
+	}
+
+	hv_send_hot_remove_response(resp, request_index, false);
+}
+
 #endif
 
 static void hot_remove_req(union dm_msg_info *msg_info)
 {
 	struct hv_dynmem_device *dm = &dm_device;
+	unsigned int numa_node = msg_info->hot_remove.virtual_node;
+	unsigned int page_count = msg_info->hot_remove.page_count;
 
-	/* Add hot remove operation later and send failure response. */
-	hv_send_hot_remove_response((struct dm_hot_remove_response *)
+	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && do_hot_add)
+		hv_mem_hot_remove(numa_node, page_count);
+	else
+		hv_send_hot_remove_response((struct dm_hot_remove_response *)
 					balloon_up_send_buffer, 0, false);
 
 	dm->state = DM_INITIALIZED;
-- 
2.14.5