RE: [Patch v3 2/3] Drivers: hv: add Azure Blob driver

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



> Subject: RE: [Patch v3 2/3] Drivers: hv: add Azure Blob driver
> 
> From: longli@xxxxxxxxxxxxxxxxx <longli@xxxxxxxxxxxxxxxxx> Sent:
> Tuesday, July 13, 2021 7:45 PM
> >
> > Azure Blob storage provides scalable and durable data storage for Azure.
> >
> (https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fazu
> > re.microsoft.com%2Fen-
> us%2Fservices%2Fstorage%2Fblobs%2F&amp;data=04%7
> >
> C01%7Clongli%40microsoft.com%7C852590ffe972466d24b308d948723d8c%7C
> 72f9
> >
> 88bf86f141af91ab2d7cd011db47%7C1%7C0%7C637620477769866945%7CUnk
> nown%7C
> >
> TWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiL
> CJXVC
> >
> I6Mn0%3D%7C1000&amp;sdata=PZYfSVoLanlTv%2Fi%2Bks4a1IMM0cMiRxP2
> poypgs20
> > S7c%3D&amp;reserved=0)
> >
> > This driver adds support for accelerated access to Azure Blob storage.
> > As an alternative to REST APIs, it provides a fast data path that uses
> > host native network stack and secure direct data link for storage server
> access.
> >
> > This driver will be ported to FreeBSD. It's dual licensed for BSD and GPL.
> >
> > Cc: Jonathan Corbet <corbet@xxxxxxx>
> > Cc: "K. Y. Srinivasan" <kys@xxxxxxxxxxxxx>
> > Cc: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>
> > Cc: Stephen Hemminger <sthemmin@xxxxxxxxxxxxx>
> > Cc: Wei Liu <wei.liu@xxxxxxxxxx>
> > Cc: Dexuan Cui <decui@xxxxxxxxxxxxx>
> > Cc: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
> > Cc: Bjorn Andersson <bjorn.andersson@xxxxxxxxxx>
> > Cc: Hans de Goede <hdegoede@xxxxxxxxxx>
> > Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
> > Cc: Maximilian Luz <luzmaximilian@xxxxxxxxx>
> > Cc: Mike Rapoport <rppt@xxxxxxxxxx>
> > Cc: Ben Widawsky <ben.widawsky@xxxxxxxxx>
> > Cc: Jiri Slaby <jirislaby@xxxxxxxxxx>
> > Cc: Andra Paraschiv <andraprs@xxxxxxxxxx>
> > Cc: Siddharth Gupta <sidgup@xxxxxxxxxxxxxx>
> > Cc: Hannes Reinecke <hare@xxxxxxx>
> > Cc: linux-doc@xxxxxxxxxxxxxxx
> > Signed-off-by: Long Li <longli@xxxxxxxxxxxxx>
> > ---
> >  Documentation/userspace-api/ioctl/ioctl-number.rst |   2 +
> >  drivers/hv/Kconfig                                 |  10 +
> >  drivers/hv/Makefile                                |   1 +
> >  drivers/hv/azure_blob.c                            | 625 +++++++++++++++++++++
> >  drivers/hv/channel_mgmt.c                          |   7 +
> >  include/linux/hyperv.h                             |   9 +
> >  include/uapi/misc/azure_blob.h                     |  34 ++
> >  7 files changed, 688 insertions(+)
> >  create mode 100644 drivers/hv/azure_blob.c  create mode 100644
> > include/uapi/misc/azure_blob.h
> >
> > diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst
> > b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > index 9bfc2b5..d3c2a90 100644
> > --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> > +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > @@ -180,6 +180,8 @@ Code  Seq#    Include File
> Comments
> >  'R'   01     linux/rfkill.h                                          conflict!
> >  'R'   C0-DF  net/bluetooth/rfcomm.h
> >  'R'   E0     uapi/linux/fsl_mc.h
> > +'R'   F0-FF  uapi/misc/azure_blob.h                                  Microsoft Azure Blob
> driver
> > +
> > +<mailto:longli@xxxxxxxxxxxxx>
> >  'S'   all    linux/cdrom.h                                           conflict!
> >  'S'   80-81  scsi/scsi_ioctl.h                                       conflict!
> >  'S'   82-FF  scsi/scsi.h                                             conflict!
> > diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index
> > 66c794d..e08b8d3 100644
> > --- a/drivers/hv/Kconfig
> > +++ b/drivers/hv/Kconfig
> > @@ -27,4 +27,14 @@ config HYPERV_BALLOON
> >  	help
> >  	  Select this option to enable Hyper-V Balloon driver.
> >
> > +config HYPERV_AZURE_BLOB
> > +	tristate "Microsoft Azure Blob driver"
> > +	depends on HYPERV && X86_64
> > +	help
> > +	  Select this option to enable Microsoft Azure Blob driver.
> > +
> > +	  This driver supports accelerated Microsoft Azure Blob access.
> > +	  To compile this driver as a module, choose M here. The module will
> be
> > +	  called azure_blob.
> > +
> >  endmenu
> > diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index
> > 94daf82..a322575 100644
> > --- a/drivers/hv/Makefile
> > +++ b/drivers/hv/Makefile
> > @@ -2,6 +2,7 @@
> >  obj-$(CONFIG_HYPERV)		+= hv_vmbus.o
> >  obj-$(CONFIG_HYPERV_UTILS)	+= hv_utils.o
> >  obj-$(CONFIG_HYPERV_BALLOON)	+= hv_balloon.o
> > +obj-$(CONFIG_HYPERV_AZURE_BLOB)	+= azure_blob.o
> >
> >  CFLAGS_hv_trace.o = -I$(src)
> >  CFLAGS_hv_balloon.o = -I$(src)
> > diff --git a/drivers/hv/azure_blob.c b/drivers/hv/azure_blob.c new
> > file mode 100644 index 0000000..5367d5e
> > --- /dev/null
> > +++ b/drivers/hv/azure_blob.c
> > @@ -0,0 +1,625 @@
> > +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH
> > +Linux-syscall-note
> > +/* Copyright (c) Microsoft Corporation. */
> > +
> > +#include <uapi/misc/azure_blob.h>
> > +#include <linux/module.h>
> > +#include <linux/device.h>
> > +#include <linux/slab.h>
> > +#include <linux/cred.h>
> > +#include <linux/debugfs.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/hyperv.h>
> > +#include <linux/miscdevice.h>
> > +#include <linux/uio.h>
> > +
> > +struct az_blob_device {
> > +	struct hv_device *device;
> > +
> > +	/* Opened files maintained by this device */
> > +	struct list_head file_list;
> > +	/* Lock for protecting file_list */
> > +	spinlock_t file_lock;
> > +	wait_queue_head_t file_wait;
> > +
> > +	bool removing;
> > +};
> > +
> > +/* VSP messages */
> > +enum az_blob_vsp_request_type {
> > +	AZ_BLOB_DRIVER_REQUEST_FIRST     = 0x100,
> > +	AZ_BLOB_DRIVER_USER_REQUEST      = 0x100,
> > +	AZ_BLOB_DRIVER_REGISTER_BUFFER   = 0x101,
> > +	AZ_BLOB_DRIVER_DEREGISTER_BUFFER = 0x102, };
> > +
> > +/* VSC->VSP request */
> > +struct az_blob_vsp_request {
> > +	u32 version;
> > +	u32 timeout_ms;
> > +	u32 data_buffer_offset;
> > +	u32 data_buffer_length;
> > +	u32 data_buffer_valid;
> > +	u32 operation_type;
> > +	u32 request_buffer_offset;
> > +	u32 request_buffer_length;
> > +	u32 response_buffer_offset;
> > +	u32 response_buffer_length;
> > +	guid_t transaction_id;
> > +} __packed;
> > +
> > +/* VSP->VSC response */
> > +struct az_blob_vsp_response {
> > +	u32 length;
> > +	u32 error;
> > +	u32 response_len;
> > +} __packed;
> > +
> > +struct az_blob_vsp_request_ctx {
> > +	struct list_head list;
> > +	struct completion wait_vsp;
> > +	struct az_blob_request_sync *request; };
> > +
> > +struct az_blob_file_ctx {
> > +	struct list_head list;
> > +
> > +	/* List of pending requests to VSP */
> > +	struct list_head vsp_pending_requests;
> > +	/* Lock for protecting vsp_pending_requests */
> > +	spinlock_t vsp_pending_lock;
> > +	wait_queue_head_t wait_vsp_pending;
> > +};
> > +
> > +/* The maximum number of pages we can pass to VSP in a single packet
> > +*/ #define AZ_BLOB_MAX_PAGES 8192
> > +
> > +/* The one and only one */
> > +static struct az_blob_device az_blob_dev;
> > +
> > +/* Ring buffer size in bytes */
> > +#define AZ_BLOB_RING_SIZE (128 * 1024)
> > +
> > +/* System wide device queue depth */
> > +#define AZ_BLOB_QUEUE_DEPTH 1024
> > +
> > +static const struct hv_vmbus_device_id id_table[] = {
> > +	{ HV_AZURE_BLOB_GUID,
> > +	  .driver_data = 0
> > +	},
> > +	{ },
> > +};
> > +
> > +#define az_blob_dbg(fmt, args...)
> > +dev_dbg(&az_blob_dev.device->device, fmt, ##args) #define
> > +az_blob_info(fmt, args...) dev_info(&az_blob_dev.device->device, fmt,
> > +##args) #define az_blob_warn(fmt, args...)
> > +dev_warn(&az_blob_dev.device->device, fmt, ##args) #define
> > +az_blob_err(fmt, args...) dev_err(&az_blob_dev.device->device, fmt,
> > +##args)
> > +
> > +static void az_blob_on_channel_callback(void *context) {
> > +	struct vmbus_channel *channel = (struct vmbus_channel *)context;
> > +	const struct vmpacket_descriptor *desc;
> > +
> > +	az_blob_dbg("entering interrupt from vmbus\n");
> > +	foreach_vmbus_pkt(desc, channel) {
> > +		struct az_blob_vsp_request_ctx *request_ctx;
> > +		struct az_blob_vsp_response *response;
> > +		u64 cmd_rqst = vmbus_request_addr(&channel->requestor,
> > +						  desc->trans_id);
> > +		if (cmd_rqst == VMBUS_RQST_ERROR) {
> > +			az_blob_err("incorrect transaction id %llu\n",
> > +				    desc->trans_id);
> > +			continue;
> > +		}
> > +		request_ctx = (struct az_blob_vsp_request_ctx *)cmd_rqst;
> > +		response = hv_pkt_data(desc);
> > +
> > +		az_blob_dbg("got response for request %pUb status %u "
> > +			    "response_len %u\n",
> > +			    &request_ctx->request->guid, response->error,
> > +			    response->response_len);
> > +		request_ctx->request->response.status = response->error;
> > +		request_ctx->request->response.response_len =
> > +			response->response_len;
> > +		complete(&request_ctx->wait_vsp);
> > +	}
> > +}
> > +
> > +static int az_blob_fop_open(struct inode *inode, struct file *file) {
> > +	struct az_blob_file_ctx *file_ctx;
> > +	unsigned long flags;
> > +
> > +	file_ctx = kzalloc(sizeof(*file_ctx), GFP_KERNEL);
> > +	if (!file_ctx)
> > +		return -ENOMEM;
> > +
> > +	rcu_read_lock();
> > +
> > +	if (az_blob_dev.removing) {
> > +		rcu_read_unlock();
> > +		kfree(file_ctx);
> > +		return -ENODEV;
> > +	}
> > +
> > +	INIT_LIST_HEAD(&file_ctx->vsp_pending_requests);
> > +	init_waitqueue_head(&file_ctx->wait_vsp_pending);
> > +	spin_lock_init(&file_ctx->vsp_pending_lock);
> > +	file->private_data = file_ctx;
> > +
> > +	spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> > +	list_add_tail(&file_ctx->list, &az_blob_dev.file_list);
> > +	spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
> 
> I would think that this function is never called with interrupts disabled, so the
> simpler spin_lock()/spin_unlock() functions could be used.
> 
> > +
> > +	rcu_read_unlock();
> > +
> > +	return 0;
> > +}
> > +
> > +static int az_blob_fop_release(struct inode *inode, struct file
> > +*file) {
> > +	struct az_blob_file_ctx *file_ctx = file->private_data;
> > +	unsigned long flags;
> > +
> > +	wait_event(file_ctx->wait_vsp_pending,
> > +		   list_empty(&file_ctx->vsp_pending_requests));
> > +
> > +	spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> > +	list_del(&file_ctx->list);
> > +	if (list_empty(&az_blob_dev.file_list))
> > +		wake_up(&az_blob_dev.file_wait);
> > +	spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
> 
> I would think that this function is never called with interrupts disabled, so the
> simpler spin_lock()/spin_unlock() functions could be used.
> 
> > +
> > +	kfree(file_ctx);
> > +
> > +	return 0;
> > +}
> > +
> > +static inline bool az_blob_safe_file_access(struct file *file) {
> > +	return file->f_cred == current_cred() && !uaccess_kernel(); }
> > +
> > +static int get_buffer_pages(int rw, void __user *buffer, u32 buffer_len,
> > +			    struct page ***ppages, size_t *start,
> > +			    size_t *num_pages)
> > +{
> > +	struct iovec iov;
> > +	struct iov_iter iter;
> > +	int ret;
> > +	ssize_t result;
> > +	struct page **pages;
> > +
> > +	ret = import_single_range(rw, buffer, buffer_len, &iov, &iter);
> > +	if (ret) {
> > +		az_blob_dbg("request buffer access error %d\n", ret);
> > +		return ret;
> > +	}
> > +	az_blob_dbg("iov_iter type %d offset %lu count %lu nr_segs %lu\n",
> > +		    iter.type, iter.iov_offset, iter.count, iter.nr_segs);
> > +
> > +	result = iov_iter_get_pages_alloc(&iter, &pages, buffer_len, start);
> > +	if (result < 0) {
> > +		az_blob_dbg("failed to pin user pages result=%ld\n", result);
> > +		return result;
> > +	}
> > +	if (result != buffer_len) {
> > +		az_blob_dbg("can't pin user pages requested %d got %ld\n",
> > +			    buffer_len, result);
> > +		return -EFAULT;
> > +	}
> > +
> > +	*ppages = pages;
> > +	*num_pages = (result + *start + PAGE_SIZE - 1) / PAGE_SIZE;
> > +	return 0;
> > +}
> > +
> > +static void fill_in_page_buffer(u64 *pfn_array, int *index,
> > +				struct page **pages, unsigned long
> num_pages) {
> > +	int i, page_idx = *index;
> > +
> > +	for (i = 0; i < num_pages; i++)
> > +		pfn_array[page_idx++] = page_to_pfn(pages[i]);
> > +	*index = page_idx;
> > +}
> > +
> > +static void free_buffer_pages(size_t num_pages, struct page **pages)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < num_pages; i++)
> > +		if (pages[i])
> > +			put_page(pages[i]);
> > +	kvfree(pages);
> > +}
> > +
> > +static long az_blob_ioctl_user_request(struct file *filp, unsigned
> > +long arg) {
> > +	struct az_blob_device *dev = &az_blob_dev;
> > +	struct az_blob_file_ctx *file_ctx = filp->private_data;
> > +	struct az_blob_request_sync __user *request_user =
> > +		(struct az_blob_request_sync __user *)arg;
> > +	struct az_blob_request_sync request;
> > +	struct az_blob_vsp_request_ctx request_ctx;
> > +	unsigned long flags;
> > +	int ret;
> > +	size_t request_start, request_num_pages = 0;
> > +	size_t response_start, response_num_pages = 0;
> > +	size_t data_start, data_num_pages = 0, total_num_pages;
> > +	struct page **request_pages = NULL, **response_pages = NULL;
> > +	struct page **data_pages = NULL;
> > +	struct vmbus_packet_mpb_array *desc;
> > +	u64 *pfn_array;
> > +	int desc_size;
> > +	int page_idx;
> > +	struct az_blob_vsp_request *vsp_request;
> > +
> > +	/* Fast fail if device is being removed */
> > +	if (dev->removing)
> > +		return -ENODEV;
> > +
> > +	if (!az_blob_safe_file_access(filp)) {
> > +		az_blob_dbg("process %d(%s) changed security contexts
> after"
> > +			    " opening file descriptor\n",
> > +			    task_tgid_vnr(current), current->comm);
> > +		return -EACCES;
> > +	}
> > +
> > +	if (copy_from_user(&request, request_user, sizeof(request))) {
> > +		az_blob_dbg("don't have permission to user provided
> buffer\n");
> > +		return -EFAULT;
> > +	}
> > +
> > +	az_blob_dbg("az_blob ioctl request guid %pUb timeout %u
> request_len %u"
> > +		    " response_len %u data_len %u request_buffer %llx "
> > +		    "response_buffer %llx data_buffer %llx\n",
> > +		    &request.guid, request.timeout, request.request_len,
> > +		    request.response_len, request.data_len,
> request.request_buffer,
> > +		    request.response_buffer, request.data_buffer);
> > +
> > +	if (!request.request_len || !request.response_len)
> > +		return -EINVAL;
> > +
> > +	if (request.data_len && request.data_len < request.data_valid)
> > +		return -EINVAL;
> > +
> > +	if (request.data_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES ||
> > +	    request.request_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES ||
> > +	    request.response_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES)
> > +		return -EINVAL;
> > +
> > +	init_completion(&request_ctx.wait_vsp);
> > +	request_ctx.request = &request;
> > +
> > +	/*
> > +	 * Need to set rw to READ to have page table set up for passing to
> VSP.
> > +	 * Setting it to WRITE will cause the page table entry not allocated
> > +	 * as it's waiting on Copy-On-Write on next page fault. This doesn't
> > +	 * work in this scenario because VSP wants all the pages to be
> present.
> > +	 */
> > +	ret = get_buffer_pages(READ, (void __user
> *)request.request_buffer,
> > +			       request.request_len, &request_pages,
> > +			       &request_start, &request_num_pages);
> > +	if (ret)
> > +		goto get_user_page_failed;
> > +
> > +	ret = get_buffer_pages(READ, (void __user
> *)request.response_buffer,
> > +			       request.response_len, &response_pages,
> > +			       &response_start, &response_num_pages);
> > +	if (ret)
> > +		goto get_user_page_failed;
> > +
> > +	if (request.data_len) {
> > +		ret = get_buffer_pages(READ,
> > +				       (void __user *)request.data_buffer,
> > +				       request.data_len, &data_pages,
> > +				       &data_start, &data_num_pages);
> > +		if (ret)
> > +			goto get_user_page_failed;
> > +	}
> > +
> > +	total_num_pages = request_num_pages + response_num_pages +
> > +				data_num_pages;
> > +	if (total_num_pages > AZ_BLOB_MAX_PAGES) {
> > +		az_blob_dbg("number of DMA pages %lu buffer
> exceeding %u\n",
> > +			    total_num_pages, AZ_BLOB_MAX_PAGES);
> > +		ret = -EINVAL;
> > +		goto get_user_page_failed;
> > +	}
> > +
> > +	/* Construct a VMBUS packet and send it over to VSP */
> > +	desc_size = struct_size(desc, range.pfn_array, total_num_pages);
> > +	desc = kzalloc(desc_size, GFP_KERNEL);
> > +	vsp_request = kzalloc(sizeof(*vsp_request), GFP_KERNEL);
> > +	if (!desc || !vsp_request) {
> > +		kfree(desc);
> > +		kfree(vsp_request);
> > +		ret = -ENOMEM;
> > +		goto get_user_page_failed;
> > +	}
> > +
> > +	desc->range.offset = 0;
> > +	desc->range.len = total_num_pages * PAGE_SIZE;
> > +	pfn_array = desc->range.pfn_array;
> > +	page_idx = 0;
> > +
> > +	if (request.data_len) {
> > +		fill_in_page_buffer(pfn_array, &page_idx, data_pages,
> > +				    data_num_pages);
> > +		vsp_request->data_buffer_offset = data_start;
> > +		vsp_request->data_buffer_length = request.data_len;
> > +		vsp_request->data_buffer_valid = request.data_valid;
> > +	}
> > +
> > +	fill_in_page_buffer(pfn_array, &page_idx, request_pages,
> > +			    request_num_pages);
> > +	vsp_request->request_buffer_offset = request_start +
> > +						data_num_pages *
> PAGE_SIZE;
> > +	vsp_request->request_buffer_length = request.request_len;
> > +
> > +	fill_in_page_buffer(pfn_array, &page_idx, response_pages,
> > +			    response_num_pages);
> > +	vsp_request->response_buffer_offset = response_start +
> > +		(data_num_pages + request_num_pages) * PAGE_SIZE;
> > +	vsp_request->response_buffer_length = request.response_len;
> > +
> > +	vsp_request->version = 0;
> > +	vsp_request->timeout_ms = request.timeout;
> > +	vsp_request->operation_type = AZ_BLOB_DRIVER_USER_REQUEST;
> > +	guid_copy(&vsp_request->transaction_id, &request.guid);
> > +
> > +	spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> > +	list_add_tail(&request_ctx.list, &file_ctx->vsp_pending_requests);
> > +	spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
> 
> I would think that this function is never called with interrupts disabled, so the
> simpler spin_lock()/spin_unlock() functions could be used.
> 
> > +
> > +	az_blob_dbg("sending request to VSP\n");
> > +	az_blob_dbg("desc_size %u desc->range.len %u desc-
> >range.offset %u\n",
> > +		    desc_size, desc->range.len, desc->range.offset);
> > +	az_blob_dbg("vsp_request data_buffer_offset %u
> data_buffer_length %u "
> > +		    "data_buffer_valid %u request_buffer_offset %u "
> > +		    "request_buffer_length %u response_buffer_offset %u "
> > +		    "response_buffer_length %u\n",
> > +		    vsp_request->data_buffer_offset,
> > +		    vsp_request->data_buffer_length,
> > +		    vsp_request->data_buffer_valid,
> > +		    vsp_request->request_buffer_offset,
> > +		    vsp_request->request_buffer_length,
> > +		    vsp_request->response_buffer_offset,
> > +		    vsp_request->response_buffer_length);
> > +
> > +	ret = vmbus_sendpacket_mpb_desc(dev->device->channel, desc,
> desc_size,
> > +					vsp_request, sizeof(*vsp_request),
> > +					(u64)&request_ctx);
> > +
> > +	kfree(desc);
> > +	kfree(vsp_request);
> > +	if (ret)
> > +		goto vmbus_send_failed;
> > +
> > +	wait_for_completion(&request_ctx.wait_vsp);
> > +
> > +	/*
> > +	 * At this point, the response is already written to request
> > +	 * by VMBUS completion handler, copy them to user-mode buffers
> > +	 * and return to user-mode
> > +	 */
> > +	if (copy_to_user(&request_user->response, &request.response,
> > +			 sizeof(request.response)))
> > +		ret = -EFAULT;
> > +
> > +vmbus_send_failed:
> > +	spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> > +	list_del(&request_ctx.list);
> > +	if (list_empty(&file_ctx->vsp_pending_requests))
> > +		wake_up(&file_ctx->wait_vsp_pending);
> > +	spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
> 
> I would think that this function is never called with interrupts disabled, so the
> simpler spin_lock()/spin_unlock() functions could be used.
> 
> > +
> > +get_user_page_failed:
> > +	free_buffer_pages(request_num_pages, request_pages);
> > +	free_buffer_pages(response_num_pages, response_pages);
> > +	free_buffer_pages(data_num_pages, data_pages);
> > +
> > +	return ret;
> > +}
> > +
> > +static long az_blob_fop_ioctl(struct file *filp, unsigned int cmd,
> > +			      unsigned long arg)
> > +{
> > +	switch (cmd) {
> > +	case IOCTL_AZ_BLOB_DRIVER_USER_REQUEST:
> > +		return az_blob_ioctl_user_request(filp, arg);
> > +
> > +	default:
> > +		az_blob_dbg("unrecognized IOCTL code %u\n", cmd);
> > +	}
> > +
> > +	return -EINVAL;
> > +}
> > +
> > +static const struct file_operations az_blob_client_fops = {
> > +	.owner	= THIS_MODULE,
> > +	.open	= az_blob_fop_open,
> > +	.unlocked_ioctl = az_blob_fop_ioctl,
> > +	.release = az_blob_fop_release,
> > +};
> > +
> > +static struct miscdevice az_blob_misc_device = {
> > +	MISC_DYNAMIC_MINOR,
> > +	"azure_blob",
> > +	&az_blob_client_fops,
> > +};
> > +
> > +static int az_blob_show_pending_requests(struct seq_file *m, void *v)
> > +{
> > +	unsigned long flags, flags2;
> > +	struct az_blob_vsp_request_ctx *request_ctx;
> > +	struct az_blob_file_ctx *file_ctx;
> > +
> > +	seq_puts(m, "List of pending requests\n");
> > +	seq_puts(m, "UUID request_len response_len data_len "
> > +		"request_buffer response_buffer data_buffer\n");
> > +	spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> > +	list_for_each_entry(file_ctx, &az_blob_dev.file_list, list) {
> > +		spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags2);
> > +		list_for_each_entry(request_ctx,
> > +				    &file_ctx->vsp_pending_requests, list) {
> > +			seq_printf(m, "%pUb ", &request_ctx->request-
> >guid);
> > +			seq_printf(m, "%u ", request_ctx->request-
> >request_len);
> > +			seq_printf(m, "%u ",
> > +				   request_ctx->request->response_len);
> > +			seq_printf(m, "%u ", request_ctx->request-
> >data_len);
> > +			seq_printf(m, "%llx ",
> > +				   request_ctx->request->request_buffer);
> > +			seq_printf(m, "%llx ",
> > +				   request_ctx->request->response_buffer);
> > +			seq_printf(m, "%llx\n",
> > +				   request_ctx->request->data_buffer);
> > +		}
> > +		spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags2);
> > +	}
> > +	spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
> > +
> > +	return 0;
> > +}
> > +
> > +static int az_blob_debugfs_open(struct inode *inode, struct file
> > +*file) {
> > +	return single_open(file, az_blob_show_pending_requests, NULL); }
> > +
> > +static const struct file_operations az_blob_debugfs_fops = {
> > +	.open		= az_blob_debugfs_open,
> > +	.read		= seq_read,
> > +	.llseek		= seq_lseek,
> > +	.release	= seq_release
> > +};
> > +
> > +static void az_blob_remove_device(void) {
> > +	struct dentry *debugfs_root = debugfs_lookup("az_blob", NULL);
> > +
> > +	misc_deregister(&az_blob_misc_device);
> > +	debugfs_remove_recursive(debugfs_lookup("pending_requests",
> > +						debugfs_root));
> > +	debugfs_remove_recursive(debugfs_root);
> > +	/* At this point, we won't get any requests from user-mode */ }
> > +
> > +static int az_blob_create_device(struct az_blob_device *dev) {
> > +	int ret;
> > +	struct dentry *debugfs_root;
> > +
> > +	ret = misc_register(&az_blob_misc_device);
> > +	if (ret)
> > +		return ret;
> > +
> > +	debugfs_root = debugfs_create_dir("az_blob", NULL);
> > +	debugfs_create_file("pending_requests", 0400, debugfs_root, NULL,
> > +			    &az_blob_debugfs_fops);
> > +
> > +	return 0;
> > +}
> > +
> > +static int az_blob_connect_to_vsp(struct hv_device *device, u32
> > +ring_size) {
> > +	int ret;
> > +
> > +	spin_lock_init(&az_blob_dev.file_lock);
> > +	INIT_LIST_HEAD(&az_blob_dev.file_list);
> > +	init_waitqueue_head(&az_blob_dev.file_wait);
> > +
> > +	az_blob_dev.device = device;
> > +	device->channel->rqstor_size = AZ_BLOB_QUEUE_DEPTH;
> > +
> > +	ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
> > +			 az_blob_on_channel_callback, device->channel);
> > +
> > +	if (ret)
> > +		return ret;
> > +
> > +	hv_set_drvdata(device, &az_blob_dev);
> > +
> > +	return ret;
> > +}
> > +
> > +static void az_blob_remove_vmbus(struct hv_device *device) {
> > +	/* At this point, no VSC/VSP traffic is possible over vmbus */
> > +	hv_set_drvdata(device, NULL);
> > +	vmbus_close(device->channel);
> > +}
> > +
> > +static int az_blob_probe(struct hv_device *device,
> > +			 const struct hv_vmbus_device_id *dev_id) {
> > +	int ret;
> > +
> > +	ret = az_blob_connect_to_vsp(device, AZ_BLOB_RING_SIZE);
> > +	if (ret) {
> > +		az_blob_err("error connecting to VSP ret=%d\n", ret);
> > +		return ret;
> > +	}
> > +
> > +	// create user-mode client library facing device
> > +	ret = az_blob_create_device(&az_blob_dev);
> > +	if (ret) {
> > +		az_blob_err("failed to create device ret=%d\n", ret);
> > +		az_blob_remove_vmbus(device);
> > +		return ret;
> > +	}
> > +
> > +	az_blob_dev.removing = false;
> 
> This line seems misplaced.  As soon as az_blob_create_device() returns,
> some other thread could find the device and open it.  You don't want the
> az_blob_fop_open() function to find the "removing"
> flag set to true.  So I think this line should go *before* the call to
> az_blob_create_device().
> 
> > +	az_blob_info("successfully probed device\n");
> > +
> > +	return 0;
> > +}
> > +
> > +static int az_blob_remove(struct hv_device *dev) {
> > +	az_blob_dev.removing = true;
> > +	/*
> > +	 * RCU lock guarantees that any future calls to az_blob_fop_open()
> > +	 * can not use device resources while the inode reference of
> > +	 * /dev/azure_blob is being held for that open, and device file is
> > +	 * being removed from /dev.
> > +	 */
> > +	synchronize_rcu();
> 
> I don't think this works as you have described.  While it will ensure that any
> in-progress RCU read-side critical sections have completed (i.e., in
> az_blob_fop_open() ), it does not prevent new read-side critical sections
> from being started.  So as soon as synchronize_rcu() is finished, another
> thread could find and open the device, and be executing in
> az_blob_fop_open().
> 
> But it's not clear to me that this (and the rcu_read_locks in
> az_blob_fop_open) are really needed anyway.  If az_blob_remove_device()
> is called while one or more threads have it open, I think that's OK.  Or is there
> a scenario that I'm missing?

I was trying to address your comment earlier. Here were your comments (1 - 7):

1) The driver is unbound, so az_blob_remove() is called.
2) az_blob_remove() sets the "removing" flag to true, and calls az_blob_remove_device().
3) az_blob_remove_device() waits for the file_list to become empty.
4) After the file_list becomes empty, but before misc_deregister() is called, a separate thread opens the device again.
5) In the separate thread, az_blob_fop_open() obtains the file_lock spin lock.
6) Before az_blob_fop_open() releases the spin lock, az
block_remove_device() completes, along with az_blob_remove().
7) Then the device gets rebound, and az_blob_connect_to_vsp() gets called, all while az_blob_fop_open() still holds the spin lock.  So the spin lock get re-initialized while it is held.

Between step 4 and step 5, I don't see any guarantee that az_blob_fop_open() can't run concurrently on another CPU after misc_deregister() finishes. misc_deregister() calls devtmpfs_delete_node() to remove the device file from /dev/*, but it doesn't check the return value, so the inode reference number can be non-zero after it returns, somebody may still try to open it. This check guarantees that the code can't reference any driver's internal data structures. az_blob_dev.removing is set so this code can't be entered. Resetting it after az_blob_create_device() is also for this purpose.

> 
> > +
> > +	az_blob_info("removing device\n");
> > +	az_blob_remove_device();
> > +
> > +	/*
> > +	 * At this point, it's not possible to open more files.
> > +	 * Wait for all the opened files to be released.
> > +	 */
> > +	wait_event(az_blob_dev.file_wait,
> > +list_empty(&az_blob_dev.file_list));
> 
> As mentioned in my most recent comments on the previous version of the
> code, I'm concerned about waiting for all open files to be released in the case
> of a VMbus rescind.  We definitely have to wait for all VSP operations to
> complete, but that's different from waiting for the files to be closed.  The
> former depends on Hyper-V being well-behaved and will presumably
> happen quickly in the case of a rescind.  But the latter depends entirely on
> user space code that is out of the kernel's control.  The user space process
> could hang around for hours or days with the file still open, which would
> block this function from completing, and hence block the global work queue.
> 
> Thinking about this, the core issue may be that having a single static instance
> of az_blob_device is problematic if we allow the device to be removed
> (either explicitly with an unbind, or implicitly with a VMbus
> rescind) and then re-added.  Perhaps what needs to happen is that the
> removed device is allowed to continue to exist as long as user space
> processes have an open file handle, but they can't perform any operations
> because the "removing" flag is set (and stays set).
> If the device is re-added, then a new instance of az_blob_device should be
> created, and whether or not the old instance is still hanging around is
> irrelevant.

I agree dynamic device objects is the way to go. Will implement this.

> 
> > +
> > +	az_blob_remove_vmbus(dev);
> > +	return 0;
> > +}
> > +
> > +static struct hv_driver az_blob_drv = {
> > +	.name = KBUILD_MODNAME,
> > +	.id_table = id_table,
> > +	.probe = az_blob_probe,
> > +	.remove = az_blob_remove,
> > +	.driver = {
> > +		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
> > +	},
> > +};
> > +
> > +static int __init az_blob_drv_init(void) {
> > +	return vmbus_driver_register(&az_blob_drv);
> > +}
> > +
> > +static void __exit az_blob_drv_exit(void) {
> > +	vmbus_driver_unregister(&az_blob_drv);
> > +}
> > +
> > +MODULE_LICENSE("Dual BSD/GPL");
> > +MODULE_DESCRIPTION("Microsoft Azure Blob driver");
> > +module_init(az_blob_drv_init); module_exit(az_blob_drv_exit);
> > diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> > index 705e95d..3095611 100644
> > --- a/drivers/hv/channel_mgmt.c
> > +++ b/drivers/hv/channel_mgmt.c
> > @@ -154,6 +154,13 @@
> >  	  .allowed_in_isolated = false,
> >  	},
> >
> > +	/* Azure Blob */
> > +	{ .dev_type = HV_AZURE_BLOB,
> > +	  HV_AZURE_BLOB_GUID,
> > +	  .perf_device = false,
> > +	  .allowed_in_isolated = false,
> > +	},
> > +
> >  	/* Unknown GUID */
> >  	{ .dev_type = HV_UNKNOWN,
> >  	  .perf_device = false,
> > diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index
> > d1e59db..ac31362 100644
> > --- a/include/linux/hyperv.h
> > +++ b/include/linux/hyperv.h
> > @@ -772,6 +772,7 @@ enum vmbus_device_type {
> >  	HV_FCOPY,
> >  	HV_BACKUP,
> >  	HV_DM,
> > +	HV_AZURE_BLOB,
> >  	HV_UNKNOWN,
> >  };
> >
> > @@ -1350,6 +1351,14 @@ int vmbus_allocate_mmio(struct resource
> **new, struct hv_device *device_obj,
> >  			  0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f)
> >
> >  /*
> > + * Azure Blob GUID
> > + * {0590b792-db64-45cc-81db-b8d70c577c9e}
> > + */
> > +#define HV_AZURE_BLOB_GUID \
> > +	.guid = GUID_INIT(0x0590b792, 0xdb64, 0x45cc, 0x81, 0xdb, \
> > +			  0xb8, 0xd7, 0x0c, 0x57, 0x7c, 0x9e)
> > +
> > +/*
> >   * Shutdown GUID
> >   * {0e0b6031-5213-4934-818b-38d90ced39db}
> >   */
> > diff --git a/include/uapi/misc/azure_blob.h
> > b/include/uapi/misc/azure_blob.h new file mode 100644 index
> > 0000000..f4168679
> > --- /dev/null
> > +++ b/include/uapi/misc/azure_blob.h
> > @@ -0,0 +1,34 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH
> > +Linux-syscall-note */
> > +/* Copyright (c) Microsoft Corporation. */
> > +
> > +#ifndef _AZ_BLOB_H
> > +#define _AZ_BLOB_H
> > +
> > +#include <linux/kernel.h>
> > +#include <linux/uuid.h>
> > +
> > +/* user-mode sync request sent through ioctl */ struct
> > +az_blob_request_sync_response {
> > +	__u32 status;
> > +	__u32 response_len;
> > +};
> > +
> > +struct az_blob_request_sync {
> > +	guid_t guid;
> > +	__u32 timeout;
> > +	__u32 request_len;
> > +	__u32 response_len;
> > +	__u32 data_len;
> > +	__u32 data_valid;
> > +	__aligned_u64 request_buffer;
> 
> Is there an implied 32-bit padding field before "request_buffer"?
> It seems like "yes", since there are five 32-bit field preceding.
> Would it be better to explicitly list that padding field?
> 
> > +	__aligned_u64 response_buffer;
> > +	__aligned_u64 data_buffer;
> > +	struct az_blob_request_sync_response response; };
> > +
> > +#define AZ_BLOB_MAGIC_NUMBER	'R'
> > +#define IOCTL_AZ_BLOB_DRIVER_USER_REQUEST \
> > +		_IOWR(AZ_BLOB_MAGIC_NUMBER, 0xf0, \
> > +			struct az_blob_request_sync)
> > +
> > +#endif /* define _AZ_BLOB_H */
> > --
> > 1.8.3.1





[Index of Archives]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite Forum]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Video 4 Linux]     [Device Mapper]     [Linux Resources]

  Powered by Linux