> Subject: RE: [Patch v3 2/3] Drivers: hv: add Azure Blob driver > > From: longli@xxxxxxxxxxxxxxxxx <longli@xxxxxxxxxxxxxxxxx> Sent: > Tuesday, July 13, 2021 7:45 PM > > > > Azure Blob storage provides scalable and durable data storage for Azure. > > > (https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fazu > > re.microsoft.com%2Fen- > us%2Fservices%2Fstorage%2Fblobs%2F&data=04%7 > > > C01%7Clongli%40microsoft.com%7C852590ffe972466d24b308d948723d8c%7C > 72f9 > > > 88bf86f141af91ab2d7cd011db47%7C1%7C0%7C637620477769866945%7CUnk > nown%7C > > > TWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiL > CJXVC > > > I6Mn0%3D%7C1000&sdata=PZYfSVoLanlTv%2Fi%2Bks4a1IMM0cMiRxP2 > poypgs20 > > S7c%3D&reserved=0) > > > > This driver adds support for accelerated access to Azure Blob storage. > > As an alternative to REST APIs, it provides a fast data path that uses > > host native network stack and secure direct data link for storage server > access. > > > > This driver will be ported to FreeBSD. It's dual licensed for BSD and GPL. > > > > Cc: Jonathan Corbet <corbet@xxxxxxx> > > Cc: "K. Y. 
Srinivasan" <kys@xxxxxxxxxxxxx> > > Cc: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx> > > Cc: Stephen Hemminger <sthemmin@xxxxxxxxxxxxx> > > Cc: Wei Liu <wei.liu@xxxxxxxxxx> > > Cc: Dexuan Cui <decui@xxxxxxxxxxxxx> > > Cc: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> > > Cc: Bjorn Andersson <bjorn.andersson@xxxxxxxxxx> > > Cc: Hans de Goede <hdegoede@xxxxxxxxxx> > > Cc: Dan Williams <dan.j.williams@xxxxxxxxx> > > Cc: Maximilian Luz <luzmaximilian@xxxxxxxxx> > > Cc: Mike Rapoport <rppt@xxxxxxxxxx> > > Cc: Ben Widawsky <ben.widawsky@xxxxxxxxx> > > Cc: Jiri Slaby <jirislaby@xxxxxxxxxx> > > Cc: Andra Paraschiv <andraprs@xxxxxxxxxx> > > Cc: Siddharth Gupta <sidgup@xxxxxxxxxxxxxx> > > Cc: Hannes Reinecke <hare@xxxxxxx> > > Cc: linux-doc@xxxxxxxxxxxxxxx > > Signed-off-by: Long Li <longli@xxxxxxxxxxxxx> > > --- > > Documentation/userspace-api/ioctl/ioctl-number.rst | 2 + > > drivers/hv/Kconfig | 10 + > > drivers/hv/Makefile | 1 + > > drivers/hv/azure_blob.c | 625 +++++++++++++++++++++ > > drivers/hv/channel_mgmt.c | 7 + > > include/linux/hyperv.h | 9 + > > include/uapi/misc/azure_blob.h | 34 ++ > > 7 files changed, 688 insertions(+) > > create mode 100644 drivers/hv/azure_blob.c create mode 100644 > > include/uapi/misc/azure_blob.h > > > > diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst > > b/Documentation/userspace-api/ioctl/ioctl-number.rst > > index 9bfc2b5..d3c2a90 100644 > > --- a/Documentation/userspace-api/ioctl/ioctl-number.rst > > +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst > > @@ -180,6 +180,8 @@ Code Seq# Include File > Comments > > 'R' 01 linux/rfkill.h conflict! > > 'R' C0-DF net/bluetooth/rfcomm.h > > 'R' E0 uapi/linux/fsl_mc.h > > +'R' F0-FF uapi/misc/azure_blob.h Microsoft Azure Blob > driver > > + > > +<mailto:longli@xxxxxxxxxxxxx> > > 'S' all linux/cdrom.h conflict! > > 'S' 80-81 scsi/scsi_ioctl.h conflict! > > 'S' 82-FF scsi/scsi.h conflict! 
> > diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index > > 66c794d..e08b8d3 100644 > > --- a/drivers/hv/Kconfig > > +++ b/drivers/hv/Kconfig > > @@ -27,4 +27,14 @@ config HYPERV_BALLOON > > help > > Select this option to enable Hyper-V Balloon driver. > > > > +config HYPERV_AZURE_BLOB > > + tristate "Microsoft Azure Blob driver" > > + depends on HYPERV && X86_64 > > + help > > + Select this option to enable Microsoft Azure Blob driver. > > + > > + This driver supports accelerated Microsoft Azure Blob access. > > + To compile this driver as a module, choose M here. The module will > be > > + called azure_blob. > > + > > endmenu > > diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index > > 94daf82..a322575 100644 > > --- a/drivers/hv/Makefile > > +++ b/drivers/hv/Makefile > > @@ -2,6 +2,7 @@ > > obj-$(CONFIG_HYPERV) += hv_vmbus.o > > obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o > > obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o > > +obj-$(CONFIG_HYPERV_AZURE_BLOB) += azure_blob.o > > > > CFLAGS_hv_trace.o = -I$(src) > > CFLAGS_hv_balloon.o = -I$(src) > > diff --git a/drivers/hv/azure_blob.c b/drivers/hv/azure_blob.c new > > file mode 100644 index 0000000..5367d5e > > --- /dev/null > > +++ b/drivers/hv/azure_blob.c > > @@ -0,0 +1,625 @@ > > +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH > > +Linux-syscall-note > > +/* Copyright (c) Microsoft Corporation. 
*/ > > + > > +#include <uapi/misc/azure_blob.h> > > +#include <linux/module.h> > > +#include <linux/device.h> > > +#include <linux/slab.h> > > +#include <linux/cred.h> > > +#include <linux/debugfs.h> > > +#include <linux/pagemap.h> > > +#include <linux/hyperv.h> > > +#include <linux/miscdevice.h> > > +#include <linux/uio.h> > > + > > +struct az_blob_device { > > + struct hv_device *device; > > + > > + /* Opened files maintained by this device */ > > + struct list_head file_list; > > + /* Lock for protecting file_list */ > > + spinlock_t file_lock; > > + wait_queue_head_t file_wait; > > + > > + bool removing; > > +}; > > + > > +/* VSP messages */ > > +enum az_blob_vsp_request_type { > > + AZ_BLOB_DRIVER_REQUEST_FIRST = 0x100, > > + AZ_BLOB_DRIVER_USER_REQUEST = 0x100, > > + AZ_BLOB_DRIVER_REGISTER_BUFFER = 0x101, > > + AZ_BLOB_DRIVER_DEREGISTER_BUFFER = 0x102, }; > > + > > +/* VSC->VSP request */ > > +struct az_blob_vsp_request { > > + u32 version; > > + u32 timeout_ms; > > + u32 data_buffer_offset; > > + u32 data_buffer_length; > > + u32 data_buffer_valid; > > + u32 operation_type; > > + u32 request_buffer_offset; > > + u32 request_buffer_length; > > + u32 response_buffer_offset; > > + u32 response_buffer_length; > > + guid_t transaction_id; > > +} __packed; > > + > > +/* VSP->VSC response */ > > +struct az_blob_vsp_response { > > + u32 length; > > + u32 error; > > + u32 response_len; > > +} __packed; > > + > > +struct az_blob_vsp_request_ctx { > > + struct list_head list; > > + struct completion wait_vsp; > > + struct az_blob_request_sync *request; }; > > + > > +struct az_blob_file_ctx { > > + struct list_head list; > > + > > + /* List of pending requests to VSP */ > > + struct list_head vsp_pending_requests; > > + /* Lock for protecting vsp_pending_requests */ > > + spinlock_t vsp_pending_lock; > > + wait_queue_head_t wait_vsp_pending; > > +}; > > + > > +/* The maximum number of pages we can pass to VSP in a single packet > > +*/ #define AZ_BLOB_MAX_PAGES 8192 > 
> + > > +/* The one and only one */ > > +static struct az_blob_device az_blob_dev; > > + > > +/* Ring buffer size in bytes */ > > +#define AZ_BLOB_RING_SIZE (128 * 1024) > > + > > +/* System wide device queue depth */ > > +#define AZ_BLOB_QUEUE_DEPTH 1024 > > + > > +static const struct hv_vmbus_device_id id_table[] = { > > + { HV_AZURE_BLOB_GUID, > > + .driver_data = 0 > > + }, > > + { }, > > +}; > > + > > +#define az_blob_dbg(fmt, args...) > > +dev_dbg(&az_blob_dev.device->device, fmt, ##args) #define > > +az_blob_info(fmt, args...) dev_info(&az_blob_dev.device->device, fmt, > > +##args) #define az_blob_warn(fmt, args...) > > +dev_warn(&az_blob_dev.device->device, fmt, ##args) #define > > +az_blob_err(fmt, args...) dev_err(&az_blob_dev.device->device, fmt, > > +##args) > > + > > +static void az_blob_on_channel_callback(void *context) { > > + struct vmbus_channel *channel = (struct vmbus_channel *)context; > > + const struct vmpacket_descriptor *desc; > > + > > + az_blob_dbg("entering interrupt from vmbus\n"); > > + foreach_vmbus_pkt(desc, channel) { > > + struct az_blob_vsp_request_ctx *request_ctx; > > + struct az_blob_vsp_response *response; > > + u64 cmd_rqst = vmbus_request_addr(&channel->requestor, > > + desc->trans_id); > > + if (cmd_rqst == VMBUS_RQST_ERROR) { > > + az_blob_err("incorrect transaction id %llu\n", > > + desc->trans_id); > > + continue; > > + } > > + request_ctx = (struct az_blob_vsp_request_ctx *)cmd_rqst; > > + response = hv_pkt_data(desc); > > + > > + az_blob_dbg("got response for request %pUb status %u " > > + "response_len %u\n", > > + &request_ctx->request->guid, response->error, > > + response->response_len); > > + request_ctx->request->response.status = response->error; > > + request_ctx->request->response.response_len = > > + response->response_len; > > + complete(&request_ctx->wait_vsp); > > + } > > +} > > + > > +static int az_blob_fop_open(struct inode *inode, struct file *file) { > > + struct az_blob_file_ctx *file_ctx; > > + 
unsigned long flags; > > + > > + file_ctx = kzalloc(sizeof(*file_ctx), GFP_KERNEL); > > + if (!file_ctx) > > + return -ENOMEM; > > + > > + rcu_read_lock(); > > + > > + if (az_blob_dev.removing) { > > + rcu_read_unlock(); > > + kfree(file_ctx); > > + return -ENODEV; > > + } > > + > > + INIT_LIST_HEAD(&file_ctx->vsp_pending_requests); > > + init_waitqueue_head(&file_ctx->wait_vsp_pending); > > + spin_lock_init(&file_ctx->vsp_pending_lock); > > + file->private_data = file_ctx; > > + > > + spin_lock_irqsave(&az_blob_dev.file_lock, flags); > > + list_add_tail(&file_ctx->list, &az_blob_dev.file_list); > > + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags); > > I would think that this function is never called with interrupts disabled, so the > simpler spin_lock()/spin_unlock() functions could be used. > > > + > > + rcu_read_unlock(); > > + > > + return 0; > > +} > > + > > +static int az_blob_fop_release(struct inode *inode, struct file > > +*file) { > > + struct az_blob_file_ctx *file_ctx = file->private_data; > > + unsigned long flags; > > + > > + wait_event(file_ctx->wait_vsp_pending, > > + list_empty(&file_ctx->vsp_pending_requests)); > > + > > + spin_lock_irqsave(&az_blob_dev.file_lock, flags); > > + list_del(&file_ctx->list); > > + if (list_empty(&az_blob_dev.file_list)) > > + wake_up(&az_blob_dev.file_wait); > > + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags); > > I would think that this function is never called with interrupts disabled, so the > simpler spin_lock()/spin_unlock() functions could be used. 
> > > + > > + kfree(file_ctx); > > + > > + return 0; > > +} > > + > > +static inline bool az_blob_safe_file_access(struct file *file) { > > + return file->f_cred == current_cred() && !uaccess_kernel(); } > > + > > +static int get_buffer_pages(int rw, void __user *buffer, u32 buffer_len, > > + struct page ***ppages, size_t *start, > > + size_t *num_pages) > > +{ > > + struct iovec iov; > > + struct iov_iter iter; > > + int ret; > > + ssize_t result; > > + struct page **pages; > > + > > + ret = import_single_range(rw, buffer, buffer_len, &iov, &iter); > > + if (ret) { > > + az_blob_dbg("request buffer access error %d\n", ret); > > + return ret; > > + } > > + az_blob_dbg("iov_iter type %d offset %lu count %lu nr_segs %lu\n", > > + iter.type, iter.iov_offset, iter.count, iter.nr_segs); > > + > > + result = iov_iter_get_pages_alloc(&iter, &pages, buffer_len, start); > > + if (result < 0) { > > + az_blob_dbg("failed to pin user pages result=%ld\n", result); > > + return result; > > + } > > + if (result != buffer_len) { > > + az_blob_dbg("can't pin user pages requested %d got %ld\n", > > + buffer_len, result); > > + return -EFAULT; > > + } > > + > > + *ppages = pages; > > + *num_pages = (result + *start + PAGE_SIZE - 1) / PAGE_SIZE; > > + return 0; > > +} > > + > > +static void fill_in_page_buffer(u64 *pfn_array, int *index, > > + struct page **pages, unsigned long > num_pages) { > > + int i, page_idx = *index; > > + > > + for (i = 0; i < num_pages; i++) > > + pfn_array[page_idx++] = page_to_pfn(pages[i]); > > + *index = page_idx; > > +} > > + > > +static void free_buffer_pages(size_t num_pages, struct page **pages) > > +{ > > + unsigned long i; > > + > > + for (i = 0; i < num_pages; i++) > > + if (pages[i]) > > + put_page(pages[i]); > > + kvfree(pages); > > +} > > + > > +static long az_blob_ioctl_user_request(struct file *filp, unsigned > > +long arg) { > > + struct az_blob_device *dev = &az_blob_dev; > > + struct az_blob_file_ctx *file_ctx = filp->private_data; > > + 
struct az_blob_request_sync __user *request_user = > > + (struct az_blob_request_sync __user *)arg; > > + struct az_blob_request_sync request; > > + struct az_blob_vsp_request_ctx request_ctx; > > + unsigned long flags; > > + int ret; > > + size_t request_start, request_num_pages = 0; > > + size_t response_start, response_num_pages = 0; > > + size_t data_start, data_num_pages = 0, total_num_pages; > > + struct page **request_pages = NULL, **response_pages = NULL; > > + struct page **data_pages = NULL; > > + struct vmbus_packet_mpb_array *desc; > > + u64 *pfn_array; > > + int desc_size; > > + int page_idx; > > + struct az_blob_vsp_request *vsp_request; > > + > > + /* Fast fail if device is being removed */ > > + if (dev->removing) > > + return -ENODEV; > > + > > + if (!az_blob_safe_file_access(filp)) { > > + az_blob_dbg("process %d(%s) changed security contexts > after" > > + " opening file descriptor\n", > > + task_tgid_vnr(current), current->comm); > > + return -EACCES; > > + } > > + > > + if (copy_from_user(&request, request_user, sizeof(request))) { > > + az_blob_dbg("don't have permission to user provided > buffer\n"); > > + return -EFAULT; > > + } > > + > > + az_blob_dbg("az_blob ioctl request guid %pUb timeout %u > request_len %u" > > + " response_len %u data_len %u request_buffer %llx " > > + "response_buffer %llx data_buffer %llx\n", > > + &request.guid, request.timeout, request.request_len, > > + request.response_len, request.data_len, > request.request_buffer, > > + request.response_buffer, request.data_buffer); > > + > > + if (!request.request_len || !request.response_len) > > + return -EINVAL; > > + > > + if (request.data_len && request.data_len < request.data_valid) > > + return -EINVAL; > > + > > + if (request.data_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES || > > + request.request_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES || > > + request.response_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES) > > + return -EINVAL; > > + > > + init_completion(&request_ctx.wait_vsp); > > + 
request_ctx.request = &request; > > + > > + /* > > + * Need to set rw to READ to have page table set up for passing to > VSP. > > + * Setting it to WRITE will cause the page table entry not allocated > > + * as it's waiting on Copy-On-Write on next page fault. This doesn't > > + * work in this scenario because VSP wants all the pages to be > present. > > + */ > > + ret = get_buffer_pages(READ, (void __user > *)request.request_buffer, > > + request.request_len, &request_pages, > > + &request_start, &request_num_pages); > > + if (ret) > > + goto get_user_page_failed; > > + > > + ret = get_buffer_pages(READ, (void __user > *)request.response_buffer, > > + request.response_len, &response_pages, > > + &response_start, &response_num_pages); > > + if (ret) > > + goto get_user_page_failed; > > + > > + if (request.data_len) { > > + ret = get_buffer_pages(READ, > > + (void __user *)request.data_buffer, > > + request.data_len, &data_pages, > > + &data_start, &data_num_pages); > > + if (ret) > > + goto get_user_page_failed; > > + } > > + > > + total_num_pages = request_num_pages + response_num_pages + > > + data_num_pages; > > + if (total_num_pages > AZ_BLOB_MAX_PAGES) { > > + az_blob_dbg("number of DMA pages %lu buffer > exceeding %u\n", > > + total_num_pages, AZ_BLOB_MAX_PAGES); > > + ret = -EINVAL; > > + goto get_user_page_failed; > > + } > > + > > + /* Construct a VMBUS packet and send it over to VSP */ > > + desc_size = struct_size(desc, range.pfn_array, total_num_pages); > > + desc = kzalloc(desc_size, GFP_KERNEL); > > + vsp_request = kzalloc(sizeof(*vsp_request), GFP_KERNEL); > > + if (!desc || !vsp_request) { > > + kfree(desc); > > + kfree(vsp_request); > > + ret = -ENOMEM; > > + goto get_user_page_failed; > > + } > > + > > + desc->range.offset = 0; > > + desc->range.len = total_num_pages * PAGE_SIZE; > > + pfn_array = desc->range.pfn_array; > > + page_idx = 0; > > + > > + if (request.data_len) { > > + fill_in_page_buffer(pfn_array, &page_idx, data_pages, > > + 
data_num_pages); > > + vsp_request->data_buffer_offset = data_start; > > + vsp_request->data_buffer_length = request.data_len; > > + vsp_request->data_buffer_valid = request.data_valid; > > + } > > + > > + fill_in_page_buffer(pfn_array, &page_idx, request_pages, > > + request_num_pages); > > + vsp_request->request_buffer_offset = request_start + > > + data_num_pages * > PAGE_SIZE; > > + vsp_request->request_buffer_length = request.request_len; > > + > > + fill_in_page_buffer(pfn_array, &page_idx, response_pages, > > + response_num_pages); > > + vsp_request->response_buffer_offset = response_start + > > + (data_num_pages + request_num_pages) * PAGE_SIZE; > > + vsp_request->response_buffer_length = request.response_len; > > + > > + vsp_request->version = 0; > > + vsp_request->timeout_ms = request.timeout; > > + vsp_request->operation_type = AZ_BLOB_DRIVER_USER_REQUEST; > > + guid_copy(&vsp_request->transaction_id, &request.guid); > > + > > + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags); > > + list_add_tail(&request_ctx.list, &file_ctx->vsp_pending_requests); > > + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags); > > I would think that this function is never called with interrupts disabled, so the > simpler spin_lock()/spin_unlock() functions could be used. 
> > > + > > + az_blob_dbg("sending request to VSP\n"); > > + az_blob_dbg("desc_size %u desc->range.len %u desc- > >range.offset %u\n", > > + desc_size, desc->range.len, desc->range.offset); > > + az_blob_dbg("vsp_request data_buffer_offset %u > data_buffer_length %u " > > + "data_buffer_valid %u request_buffer_offset %u " > > + "request_buffer_length %u response_buffer_offset %u " > > + "response_buffer_length %u\n", > > + vsp_request->data_buffer_offset, > > + vsp_request->data_buffer_length, > > + vsp_request->data_buffer_valid, > > + vsp_request->request_buffer_offset, > > + vsp_request->request_buffer_length, > > + vsp_request->response_buffer_offset, > > + vsp_request->response_buffer_length); > > + > > + ret = vmbus_sendpacket_mpb_desc(dev->device->channel, desc, > desc_size, > > + vsp_request, sizeof(*vsp_request), > > + (u64)&request_ctx); > > + > > + kfree(desc); > > + kfree(vsp_request); > > + if (ret) > > + goto vmbus_send_failed; > > + > > + wait_for_completion(&request_ctx.wait_vsp); > > + > > + /* > > + * At this point, the response is already written to request > > + * by VMBUS completion handler, copy them to user-mode buffers > > + * and return to user-mode > > + */ > > + if (copy_to_user(&request_user->response, &request.response, > > + sizeof(request.response))) > > + ret = -EFAULT; > > + > > +vmbus_send_failed: > > + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags); > > + list_del(&request_ctx.list); > > + if (list_empty(&file_ctx->vsp_pending_requests)) > > + wake_up(&file_ctx->wait_vsp_pending); > > + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags); > > I would think that this function is never called with interrupts disabled, so the > simpler spin_lock()/spin_unlock() functions could be used. 
> > > + > > +get_user_page_failed: > > + free_buffer_pages(request_num_pages, request_pages); > > + free_buffer_pages(response_num_pages, response_pages); > > + free_buffer_pages(data_num_pages, data_pages); > > + > > + return ret; > > +} > > + > > +static long az_blob_fop_ioctl(struct file *filp, unsigned int cmd, > > + unsigned long arg) > > +{ > > + switch (cmd) { > > + case IOCTL_AZ_BLOB_DRIVER_USER_REQUEST: > > + return az_blob_ioctl_user_request(filp, arg); > > + > > + default: > > + az_blob_dbg("unrecognized IOCTL code %u\n", cmd); > > + } > > + > > + return -EINVAL; > > +} > > + > > +static const struct file_operations az_blob_client_fops = { > > + .owner = THIS_MODULE, > > + .open = az_blob_fop_open, > > + .unlocked_ioctl = az_blob_fop_ioctl, > > + .release = az_blob_fop_release, > > +}; > > + > > +static struct miscdevice az_blob_misc_device = { > > + MISC_DYNAMIC_MINOR, > > + "azure_blob", > > + &az_blob_client_fops, > > +}; > > + > > +static int az_blob_show_pending_requests(struct seq_file *m, void *v) > > +{ > > + unsigned long flags, flags2; > > + struct az_blob_vsp_request_ctx *request_ctx; > > + struct az_blob_file_ctx *file_ctx; > > + > > + seq_puts(m, "List of pending requests\n"); > > + seq_puts(m, "UUID request_len response_len data_len " > > + "request_buffer response_buffer data_buffer\n"); > > + spin_lock_irqsave(&az_blob_dev.file_lock, flags); > > + list_for_each_entry(file_ctx, &az_blob_dev.file_list, list) { > > + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags2); > > + list_for_each_entry(request_ctx, > > + &file_ctx->vsp_pending_requests, list) { > > + seq_printf(m, "%pUb ", &request_ctx->request- > >guid); > > + seq_printf(m, "%u ", request_ctx->request- > >request_len); > > + seq_printf(m, "%u ", > > + request_ctx->request->response_len); > > + seq_printf(m, "%u ", request_ctx->request- > >data_len); > > + seq_printf(m, "%llx ", > > + request_ctx->request->request_buffer); > > + seq_printf(m, "%llx ", > > + 
request_ctx->request->response_buffer); > > + seq_printf(m, "%llx\n", > > + request_ctx->request->data_buffer); > > + } > > + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags2); > > + } > > + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags); > > + > > + return 0; > > +} > > + > > +static int az_blob_debugfs_open(struct inode *inode, struct file > > +*file) { > > + return single_open(file, az_blob_show_pending_requests, NULL); } > > + > > +static const struct file_operations az_blob_debugfs_fops = { > > + .open = az_blob_debugfs_open, > > + .read = seq_read, > > + .llseek = seq_lseek, > > + .release = seq_release > > +}; > > + > > +static void az_blob_remove_device(void) { > > + struct dentry *debugfs_root = debugfs_lookup("az_blob", NULL); > > + > > + misc_deregister(&az_blob_misc_device); > > + debugfs_remove_recursive(debugfs_lookup("pending_requests", > > + debugfs_root)); > > + debugfs_remove_recursive(debugfs_root); > > + /* At this point, we won't get any requests from user-mode */ } > > + > > +static int az_blob_create_device(struct az_blob_device *dev) { > > + int ret; > > + struct dentry *debugfs_root; > > + > > + ret = misc_register(&az_blob_misc_device); > > + if (ret) > > + return ret; > > + > > + debugfs_root = debugfs_create_dir("az_blob", NULL); > > + debugfs_create_file("pending_requests", 0400, debugfs_root, NULL, > > + &az_blob_debugfs_fops); > > + > > + return 0; > > +} > > + > > +static int az_blob_connect_to_vsp(struct hv_device *device, u32 > > +ring_size) { > > + int ret; > > + > > + spin_lock_init(&az_blob_dev.file_lock); > > + INIT_LIST_HEAD(&az_blob_dev.file_list); > > + init_waitqueue_head(&az_blob_dev.file_wait); > > + > > + az_blob_dev.device = device; > > + device->channel->rqstor_size = AZ_BLOB_QUEUE_DEPTH; > > + > > + ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0, > > + az_blob_on_channel_callback, device->channel); > > + > > + if (ret) > > + return ret; > > + > > + hv_set_drvdata(device, 
&az_blob_dev); > > + > > + return ret; > > +} > > + > > +static void az_blob_remove_vmbus(struct hv_device *device) { > > + /* At this point, no VSC/VSP traffic is possible over vmbus */ > > + hv_set_drvdata(device, NULL); > > + vmbus_close(device->channel); > > +} > > + > > +static int az_blob_probe(struct hv_device *device, > > + const struct hv_vmbus_device_id *dev_id) { > > + int ret; > > + > > + ret = az_blob_connect_to_vsp(device, AZ_BLOB_RING_SIZE); > > + if (ret) { > > + az_blob_err("error connecting to VSP ret=%d\n", ret); > > + return ret; > > + } > > + > > + // create user-mode client library facing device > > + ret = az_blob_create_device(&az_blob_dev); > > + if (ret) { > > + az_blob_err("failed to create device ret=%d\n", ret); > > + az_blob_remove_vmbus(device); > > + return ret; > > + } > > + > > + az_blob_dev.removing = false; > > This line seems misplaced. As soon as az_blob_create_device() returns, > some other thread could find the device and open it. You don't want the > az_blob_fop_open() function to find the "removing" > flag set to true. So I think this line should go *before* the call to > az_blob_create_device(). > > > + az_blob_info("successfully probed device\n"); > > + > > + return 0; > > +} > > + > > +static int az_blob_remove(struct hv_device *dev) { > > + az_blob_dev.removing = true; > > + /* > > + * RCU lock guarantees that any future calls to az_blob_fop_open() > > + * can not use device resources while the inode reference of > > + * /dev/azure_blob is being held for that open, and device file is > > + * being removed from /dev. > > + */ > > + synchronize_rcu(); > > I don't think this works as you have described. While it will ensure that any > in-progress RCU read-side critical sections have completed (i.e., in > az_blob_fop_open() ), it does not prevent new read-side critical sections > from being started. 
So as soon as synchronize_rcu() is finished, another > thread could find and open the device, and be executing in > az_blob_fop_open(). > > But it's not clear to me that this (and the rcu_read_locks in > az_blob_fop_open) are really needed anyway. If az_blob_remove_device() > is called while one or more threads have it open, I think that's OK. Or is there > a scenario that I'm missing? I was trying to address your comment earlier. Here were your comments (1 - 7): 1) The driver is unbound, so az_blob_remove() is called. 2) az_blob_remove() sets the "removing" flag to true, and calls az_blob_remove_device(). 3) az_blob_remove_device() waits for the file_list to become empty. 4) After the file_list becomes empty, but before misc_deregister() is called, a separate thread opens the device again. 5) In the separate thread, az_blob_fop_open() obtains the file_lock spin lock. 6) Before az_blob_fop_open() releases the spin lock, az_blob_remove_device() completes, along with az_blob_remove(). 7) Then the device gets rebound, and az_blob_connect_to_vsp() gets called, all while az_blob_fop_open() still holds the spin lock. So the spin lock gets re-initialized while it is held. Between step 4 and step 5, I don't see any guarantee that az_blob_fop_open() can't run concurrently on another CPU after misc_deregister() finishes. misc_deregister() calls devtmpfs_delete_node() to remove the device file from /dev/*, but it doesn't check the return value, so the inode reference number can be non-zero after it returns, so somebody may still try to open it. This check guarantees that the code can't reference any driver's internal data structures. az_blob_dev.removing is set so this code can't be entered. Resetting it after az_blob_create_device() is also for this purpose. > > > + > > + az_blob_info("removing device\n"); > > + az_blob_remove_device(); > > + > > + /* > > + * At this point, it's not possible to open more files. > > + * Wait for all the opened files to be released. 
> > + */ > > + wait_event(az_blob_dev.file_wait, > > +list_empty(&az_blob_dev.file_list)); > > As mentioned in my most recent comments on the previous version of the > code, I'm concerned about waiting for all open files to be released in the case > of a VMbus rescind. We definitely have to wait for all VSP operations to > complete, but that's different from waiting for the files to be closed. The > former depends on Hyper-V being well-behaved and will presumably > happen quickly in the case of a rescind. But the latter depends entirely on > user space code that is out of the kernel's control. The user space process > could hang around for hours or days with the file still open, which would > block this function from completing, and hence block the global work queue. > > Thinking about this, the core issue may be that having a single static instance > of az_blob_device is problematic if we allow the device to be removed > (either explicitly with an unbind, or implicitly with a VMbus > rescind) and then re-added. Perhaps what needs to happen is that the > removed device is allowed to continue to exist as long as user space > processes have an open file handle, but they can't perform any operations > because the "removing" flag is set (and stays set). > If the device is re-added, then a new instance of az_blob_device should be > created, and whether or not the old instance is still hanging around is > irrelevant. I agree dynamic device objects are the way to go. Will implement this. 
> > > + > > + az_blob_remove_vmbus(dev); > > + return 0; > > +} > > + > > +static struct hv_driver az_blob_drv = { > > + .name = KBUILD_MODNAME, > > + .id_table = id_table, > > + .probe = az_blob_probe, > > + .remove = az_blob_remove, > > + .driver = { > > + .probe_type = PROBE_PREFER_ASYNCHRONOUS, > > + }, > > +}; > > + > > +static int __init az_blob_drv_init(void) { > > + return vmbus_driver_register(&az_blob_drv); > > +} > > + > > +static void __exit az_blob_drv_exit(void) { > > + vmbus_driver_unregister(&az_blob_drv); > > +} > > + > > +MODULE_LICENSE("Dual BSD/GPL"); > > +MODULE_DESCRIPTION("Microsoft Azure Blob driver"); > > +module_init(az_blob_drv_init); module_exit(az_blob_drv_exit); > > diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c > > index 705e95d..3095611 100644 > > --- a/drivers/hv/channel_mgmt.c > > +++ b/drivers/hv/channel_mgmt.c > > @@ -154,6 +154,13 @@ > > .allowed_in_isolated = false, > > }, > > > > + /* Azure Blob */ > > + { .dev_type = HV_AZURE_BLOB, > > + HV_AZURE_BLOB_GUID, > > + .perf_device = false, > > + .allowed_in_isolated = false, > > + }, > > + > > /* Unknown GUID */ > > { .dev_type = HV_UNKNOWN, > > .perf_device = false, > > diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index > > d1e59db..ac31362 100644 > > --- a/include/linux/hyperv.h > > +++ b/include/linux/hyperv.h > > @@ -772,6 +772,7 @@ enum vmbus_device_type { > > HV_FCOPY, > > HV_BACKUP, > > HV_DM, > > + HV_AZURE_BLOB, > > HV_UNKNOWN, > > }; > > > > @@ -1350,6 +1351,14 @@ int vmbus_allocate_mmio(struct resource > **new, struct hv_device *device_obj, > > 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f) > > > > /* > > + * Azure Blob GUID > > + * {0590b792-db64-45cc-81db-b8d70c577c9e} > > + */ > > +#define HV_AZURE_BLOB_GUID \ > > + .guid = GUID_INIT(0x0590b792, 0xdb64, 0x45cc, 0x81, 0xdb, \ > > + 0xb8, 0xd7, 0x0c, 0x57, 0x7c, 0x9e) > > + > > +/* > > * Shutdown GUID > > * {0e0b6031-5213-4934-818b-38d90ced39db} > > */ > > diff --git 
a/include/uapi/misc/azure_blob.h > > b/include/uapi/misc/azure_blob.h new file mode 100644 index > > 0000000..f4168679 > > --- /dev/null > > +++ b/include/uapi/misc/azure_blob.h > > @@ -0,0 +1,34 @@ > > +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH > > +Linux-syscall-note */ > > +/* Copyright (c) Microsoft Corporation. */ > > + > > +#ifndef _AZ_BLOB_H > > +#define _AZ_BLOB_H > > + > > +#include <linux/kernel.h> > > +#include <linux/uuid.h> > > + > > +/* user-mode sync request sent through ioctl */ struct > > +az_blob_request_sync_response { > > + __u32 status; > > + __u32 response_len; > > +}; > > + > > +struct az_blob_request_sync { > > + guid_t guid; > > + __u32 timeout; > > + __u32 request_len; > > + __u32 response_len; > > + __u32 data_len; > > + __u32 data_valid; > > + __aligned_u64 request_buffer; > > Is there an implied 32-bit padding field before "request_buffer"? > It seems like "yes", since there are five 32-bit field preceding. > Would it be better to explicitly list that padding field? > > > + __aligned_u64 response_buffer; > > + __aligned_u64 data_buffer; > > + struct az_blob_request_sync_response response; }; > > + > > +#define AZ_BLOB_MAGIC_NUMBER 'R' > > +#define IOCTL_AZ_BLOB_DRIVER_USER_REQUEST \ > > + _IOWR(AZ_BLOB_MAGIC_NUMBER, 0xf0, \ > > + struct az_blob_request_sync) > > + > > +#endif /* define _AZ_BLOB_H */ > > -- > > 1.8.3.1