Quoting from commit 7527810573436f ("Drivers: hv: vmbus: Introduce the CHANNELMSG_MODIFYCHANNEL message type"), "[...] Hyper-V can *not* currently ACK CHANNELMSG_MODIFYCHANNEL messages with the promise that (after the ACK is sent) the channel won't send any more interrupts to the "old" CPU. The peculiarity of the CHANNELMSG_MODIFYCHANNEL messages is problematic if the user want to take a CPU offline, since we don't want to take a CPU offline (and, potentially, "lose" channel interrupts on such CPU) if the host is still processing a CHANNELMSG_MODIFYCHANNEL message associated to that CPU." Introduce the CHANNELMSG_MODIFYCHANNEL_RESPONSE(24) message type, which embodies the type of the CHANNELMSG_MODIFYCHANNEL ACK. Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@xxxxxxxxx> --- drivers/hv/channel.c | 108 ++++++++++++++++++++++++++++++++------ drivers/hv/channel_mgmt.c | 42 +++++++++++++++ drivers/hv/connection.c | 3 +- drivers/hv/hv.c | 52 ++++++++++++++++++ drivers/hv/hv_trace.h | 15 ++++++ drivers/hv/vmbus_drv.c | 4 +- include/linux/hyperv.h | 13 ++++- 7 files changed, 218 insertions(+), 19 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index fbdda9938039a..6801d89a20051 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -209,31 +209,107 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, } EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request); +static int send_modifychannel_without_ack(struct vmbus_channel *channel, u32 target_vp) +{ + struct vmbus_channel_modifychannel msg; + int ret; + + memset(&msg, 0, sizeof(msg)); + msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL; + msg.child_relid = channel->offermsg.child_relid; + msg.target_vp = target_vp; + + ret = vmbus_post_msg(&msg, sizeof(msg), true); + trace_vmbus_send_modifychannel(&msg, ret); + + return ret; +} + +static int send_modifychannel_with_ack(struct vmbus_channel *channel, u32 target_vp) +{ + struct vmbus_channel_modifychannel *msg; + struct vmbus_channel_msginfo *info; + unsigned long flags; + int ret; + + info = kzalloc(sizeof(struct vmbus_channel_msginfo) + + sizeof(struct vmbus_channel_modifychannel), + GFP_KERNEL); + if (!info) + return -ENOMEM; + + init_completion(&info->waitevent); + info->waiting_channel = channel; + + msg = (struct vmbus_channel_modifychannel *)info->msg; + msg->header.msgtype = CHANNELMSG_MODIFYCHANNEL; + msg->child_relid = channel->offermsg.child_relid; + msg->target_vp = target_vp; + + spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + list_add_tail(&info->msglistentry, &vmbus_connection.chn_msg_list); + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + + if (channel->rescind) { + ret = -ENODEV; + goto free_info; + } + + ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_modifychannel), true); + trace_vmbus_send_modifychannel(msg, ret); + if (ret != 0) + goto clean_msglist; + + /* + * Release channel_mutex; otherwise, vmbus_onoffer_rescind() could block on + * the mutex and be unable to signal the completion. + */ + mutex_unlock(&vmbus_connection.channel_mutex); + wait_for_completion(&info->waitevent); + mutex_lock(&vmbus_connection.channel_mutex); + + spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + list_del(&info->msglistentry); + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + + if (channel->rescind) { + ret = -ENODEV; + goto free_info; + } + + if (info->response.modify_response.status) { + kfree(info); + return -EAGAIN; + } + + kfree(info); + return 0; + +clean_msglist: + spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + list_del(&info->msglistentry); + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); +free_info: + kfree(info); + return ret; +} + /* * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt. * * CHANNELMSG_MODIFYCHANNEL messages are aynchronous. Also, Hyper-V does not - * ACK such messages. IOW we can't know when the host will stop interrupting - * the "old" vCPU and start interrupting the "new" vCPU for the given channel. + * ACK such messages before VERSION_WIN10_V5_3. Without ACK, we can not know + * when the host will stop interrupting the "old" vCPU and start interrupting + * the "new" vCPU for the given channel. * * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version * VERSION_WIN10_V4_1. */ -int vmbus_send_modifychannel(u32 child_relid, u32 target_vp) +int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp) { - struct vmbus_channel_modifychannel conn_msg; - int ret; - - memset(&conn_msg, 0, sizeof(conn_msg)); - conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL; - conn_msg.child_relid = child_relid; - conn_msg.target_vp = target_vp; - - ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true); - - trace_vmbus_send_modifychannel(&conn_msg, ret); - - return ret; + if (vmbus_proto_version >= VERSION_WIN10_V5_3) + return send_modifychannel_with_ack(channel, target_vp); + return send_modifychannel_without_ack(channel, target_vp); } EXPORT_SYMBOL_GPL(vmbus_send_modifychannel); diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 1d44bb635bb84..8fcb66d623ba4 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -1200,6 +1200,46 @@ static void vmbus_onopen_result(struct vmbus_channel_message_header *hdr) spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); } +/* + * vmbus_onmodifychannel_response - Modify Channel response handler. + * + * This is invoked when we received a response to our channel modify request. + * Find the matching request, copy the response and signal the requesting thread. + */ +static void vmbus_onmodifychannel_response(struct vmbus_channel_message_header *hdr) +{ + struct vmbus_channel_modifychannel_response *response; + struct vmbus_channel_msginfo *msginfo; + unsigned long flags; + + response = (struct vmbus_channel_modifychannel_response *)hdr; + + trace_vmbus_onmodifychannel_response(response); + + /* + * Find the modify msg, copy the response and signal/unblock the wait event. + */ + spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + + list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, msglistentry) { + struct vmbus_channel_message_header *responseheader = + (struct vmbus_channel_message_header *)msginfo->msg; + + if (responseheader->msgtype == CHANNELMSG_MODIFYCHANNEL) { + struct vmbus_channel_modifychannel *modifymsg; + + modifymsg = (struct vmbus_channel_modifychannel *)msginfo->msg; + if (modifymsg->child_relid == response->child_relid) { + memcpy(&msginfo->response.modify_response, response, + sizeof(struct vmbus_channel_modifychannel_response)); + complete(&msginfo->waitevent); + break; + } + } + } + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); +} + /* * vmbus_ongpadl_created - GPADL created handler. * @@ -1366,6 +1406,8 @@ channel_message_table[CHANNELMSG_COUNT] = { { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL, 0}, { CHANNELMSG_MODIFYCHANNEL, 0, NULL, 0}, { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL, 0}, + { CHANNELMSG_MODIFYCHANNEL_RESPONSE, 1, vmbus_onmodifychannel_response, + sizeof(struct vmbus_channel_modifychannel_response)}, }; /* diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 11170d9a2e1a5..cdf41c504d914 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -45,6 +45,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version); * Table of VMBus versions listed from newest to oldest. */ static __u32 vmbus_versions[] = { + VERSION_WIN10_V5_3, VERSION_WIN10_V5_2, VERSION_WIN10_V5_1, VERSION_WIN10_V5, @@ -60,7 +61,7 @@ static __u32 vmbus_versions[] = { * Maximal VMBus protocol version guests can negotiate. Useful to cap the * VMBus version for testing and debugging purpose. */ -static uint max_version = VERSION_WIN10_V5_2; +static uint max_version = VERSION_WIN10_V5_3; module_param(max_version, uint, S_IRUGO); MODULE_PARM_DESC(max_version, diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 0cde10fe0e71f..35f240f4c833e 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -16,6 +16,7 @@ #include <linux/version.h> #include <linux/random.h> #include <linux/clockchips.h> +#include <linux/delay.h> #include <clocksource/hyperv_timer.h> #include <asm/mshyperv.h> #include "hyperv_vmbus.h" @@ -237,6 +238,40 @@ void hv_synic_disable_regs(unsigned int cpu) hv_set_synic_state(sctrl.as_uint64); } +#define HV_MAX_TRIES 3 +/* + * Scan the event flags page of 'this' CPU looking for any bit that is set. If we find one + * bit set, then wait for a few milliseconds. Repeat these steps for a maximum of 3 times. + * Return 'true', if there is still any set bit after this operation; 'false', otherwise. + */ +static bool hv_synic_event_pending(void) +{ + struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); + union hv_synic_event_flags *event = + (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT; + unsigned long *recv_int_page = event->flags; + bool pending; + u32 relid; + int tries = 0; + +retry: + pending = false; + for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) { + /* Special case - VMBus channel protocol messages */ + if (relid == 0) + continue; + if (sync_test_bit(relid, recv_int_page)) { + pending = true; + break; + } + } + if (pending && tries++ < HV_MAX_TRIES) { + usleep_range(10000, 20000); + goto retry; + } + return pending; +} + int hv_synic_cleanup(unsigned int cpu) { struct vmbus_channel *channel, *sc; @@ -276,6 +311,23 @@ int hv_synic_cleanup(unsigned int cpu) if (channel_found && vmbus_connection.conn_state == CONNECTED) return -EBUSY; + if (vmbus_proto_version >= VERSION_WIN10_V5_3) { + /* + * channel_found == false means that any channels that were previously + * assigned to the CPU have been reassigned elsewhere. Since we have + * received a ModifyChannel ACK from Hyper-V for all such reassignments, + * we know that Hyper-V won't set any new bits in the event flags page. + * However, there may be existing bits set in this page that have not + * been processed by vmbus_chan_sched(). We scan the event flags page + * looking for any bits that are set and waiting (with a timeout) for + * vmbus_chan_sched() to process such bits. If bits are still set after + * this operation (and VMBus is connected), we fail the CPU offlining + * operation. + */ + if (hv_synic_event_pending() && vmbus_connection.conn_state == CONNECTED) + return -EBUSY; + } + hv_stimer_legacy_cleanup(cpu); hv_synic_disable_regs(cpu); diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h index 6063bb21bb137..3e83c24856dbe 100644 --- a/drivers/hv/hv_trace.h +++ b/drivers/hv/hv_trace.h @@ -86,6 +86,21 @@ TRACE_EVENT(vmbus_onopen_result, ) ); +TRACE_EVENT(vmbus_onmodifychannel_response, + TP_PROTO(const struct vmbus_channel_modifychannel_response *response), + TP_ARGS(response), + TP_STRUCT__entry( + __field(u32, child_relid) + __field(u32, status) + ), + TP_fast_assign(__entry->child_relid = response->child_relid; + __entry->status = response->status; + ), + TP_printk("child_relid 0x%x, status %d", + __entry->child_relid, __entry->status + ) + ); + TRACE_EVENT(vmbus_ongpadl_created, TP_PROTO(const struct vmbus_channel_gpadl_created *gpadlcreated), TP_ARGS(gpadlcreated), diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 4fad3e6745e53..3e1cd5e8f769e 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1767,13 +1767,15 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, if (target_cpu == origin_cpu) goto cpu_store_unlock; - if (vmbus_send_modifychannel(channel->offermsg.child_relid, + if (vmbus_send_modifychannel(channel, hv_cpu_number_to_vp_number(target_cpu))) { ret = -EIO; goto cpu_store_unlock; } /* + * For version before VERSION_WIN10_V5_3, the following warning holds: + * * Warning. At this point, there is *no* guarantee that the host will * have successfully processed the vmbus_send_modifychannel() request. * See the header comment of vmbus_send_modifychannel() for more info. diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 1ce131f29f3b4..808acf4c3fe61 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -234,6 +234,7 @@ static inline u32 hv_get_avail_to_write_percent( * 5 . 0 (Newer Windows 10) * 5 . 1 (Windows 10 RS4) * 5 . 2 (Windows Server 2019, RS5) + * 5 . 3 (Windows Server 2021) // FIXME: use proper version number/name */ #define VERSION_WS2008 ((0 << 16) | (13)) @@ -245,6 +246,7 @@ static inline u32 hv_get_avail_to_write_percent( #define VERSION_WIN10_V5 ((5 << 16) | (0)) #define VERSION_WIN10_V5_1 ((5 << 16) | (1)) #define VERSION_WIN10_V5_2 ((5 << 16) | (2)) +#define VERSION_WIN10_V5_3 ((5 << 16) | (3)) /* Make maximum size of pipe payload of 16K */ #define MAX_PIPE_DATA_PAYLOAD (sizeof(u8) * 16384) @@ -475,6 +477,7 @@ enum vmbus_channel_message_type { CHANNELMSG_TL_CONNECT_REQUEST = 21, CHANNELMSG_MODIFYCHANNEL = 22, CHANNELMSG_TL_CONNECT_RESULT = 23, + CHANNELMSG_MODIFYCHANNEL_RESPONSE = 24, CHANNELMSG_COUNT }; @@ -588,6 +591,13 @@ struct vmbus_channel_open_result { u32 status; } __packed; +/* Modify Channel Result parameters */ +struct vmbus_channel_modifychannel_response { + struct vmbus_channel_message_header header; + u32 child_relid; + u32 status; +} __packed; + /* Close channel parameters; */ struct vmbus_channel_close_channel { struct vmbus_channel_message_header header; @@ -720,6 +730,7 @@ struct vmbus_channel_msginfo { struct vmbus_channel_gpadl_torndown gpadl_torndown; struct vmbus_channel_gpadl_created gpadl_created; struct vmbus_channel_version_response version_response; + struct vmbus_channel_modifychannel_response modify_response; } response; u32 msgsize; @@ -1562,7 +1573,7 @@ extern __u32 vmbus_proto_version; int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, const guid_t *shv_host_servie_id); -int vmbus_send_modifychannel(u32 child_relid, u32 target_vp); +int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp); void vmbus_set_event(struct vmbus_channel *channel); /* Get the start of the ring buffer. */ -- 2.25.1