> Subject: Re: [Patch v2 2/3] Drivers: hv: add Azure Blob driver > > On 26. 06. 21, 8:30, longli@xxxxxxxxxxxxxxxxx wrote: > > Azure Blob storage provides scalable and durable data storage for Azure. > > (https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fazu > > re.microsoft.com%2Fen- > us%2Fservices%2Fstorage%2Fblobs%2F&data=04%7 > > > C01%7Clongli%40microsoft.com%7Cf5787b443bd04a8a2db708d93ad9ec59 > %7C72f9 > > > 88bf86f141af91ab2d7cd011db47%7C1%7C0%7C637605529942807730%7C > Unknown%7C > > > TWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJ > XVC > > > I6Mn0%3D%7C1000&sdata=bu0G8il7D%2F3emZkPn8FXIHWWff0WPORF > 5CBfHAsIOI > > 4%3D&reserved=0) > > > > This driver adds support for accelerated access to Azure Blob storage. > > As an alternative to REST APIs, it provides a fast data path that uses > > host native network stack and secure direct data link for storage server > access. > > > ... > > Cc: Jiri Slaby <jirislaby@xxxxxxxxxx> > > I am just curious, why am I CCed? Anyway, some comments below: > > > --- /dev/null > > +++ b/drivers/hv/azure_blob.c > > @@ -0,0 +1,655 @@ > ... > > +static void az_blob_on_channel_callback(void *context) { > > + struct vmbus_channel *channel = (struct vmbus_channel *)context; > > Have you fed this patch through checkpatch? Yes, it didn't throw out any errors. > > > + const struct vmpacket_descriptor *desc; > > + > > + az_blob_dbg("entering interrupt from vmbus\n"); > > + foreach_vmbus_pkt(desc, channel) { > > + struct az_blob_vsp_request_ctx *request_ctx; > > + struct az_blob_vsp_response *response; > > + u64 cmd_rqst = vmbus_request_addr(&channel->requestor, > > + desc->trans_id); > > + if (cmd_rqst == VMBUS_RQST_ERROR) { > > + az_blob_err("incorrect transaction id %llu\n", > > + desc->trans_id); > > + continue; > > + } > > + request_ctx = (struct az_blob_vsp_request_ctx *) cmd_rqst; > > + response = hv_pkt_data(desc); > > + > > + az_blob_dbg("got response for request %pUb status %u " > > + "response_len %u\n", > > + &request_ctx->request->guid, response->error, > > + response->response_len); > > + request_ctx->request->response.status = response->error; > > + request_ctx->request->response.response_len = > > + response->response_len; > > + complete(&request_ctx->wait_vsp); > > + } > > + > > +} > ... > > +static long az_blob_ioctl_user_request(struct file *filp, unsigned > > +long arg) { > > + struct az_blob_device *dev = &az_blob_dev; > > + struct az_blob_file_ctx *file_ctx = filp->private_data; > > + char __user *argp = (char __user *) arg; > > + struct az_blob_request_sync request; > > + struct az_blob_vsp_request_ctx request_ctx; > > + unsigned long flags; > > + int ret; > > + size_t request_start, request_num_pages = 0; > > + size_t response_start, response_num_pages = 0; > > + size_t data_start, data_num_pages = 0, total_num_pages; > > + struct page **request_pages = NULL, **response_pages = NULL; > > + struct page **data_pages = NULL; > > + struct vmbus_packet_mpb_array *desc; > > + u64 *pfn_array; > > + int desc_size; > > + int page_idx; > > + struct az_blob_vsp_request *vsp_request; > > Ugh, see further. > > > + > > + /* Fast fail if device is being removed */ > > + if (dev->removing) > > + return -ENODEV; > > + > > + if (!az_blob_safe_file_access(filp)) { > > + az_blob_dbg("process %d(%s) changed security contexts > after" > > + " opening file descriptor\n", > > + task_tgid_vnr(current), current->comm); > > + return -EACCES; > > + } > > + > > + if (copy_from_user(&request, argp, sizeof(request))) { > > + az_blob_dbg("don't have permission to user provided > buffer\n"); > > + return -EFAULT; > > + } > > + > > + az_blob_dbg("az_blob ioctl request guid %pUb timeout %u > request_len %u" > > + " response_len %u data_len %u request_buffer %llx " > > + "response_buffer %llx data_buffer %llx\n", > > + &request.guid, request.timeout, request.request_len, > > + request.response_len, request.data_len, > request.request_buffer, > > + request.response_buffer, request.data_buffer); > > + > > + if (!request.request_len || !request.response_len) > > + return -EINVAL; > > + > > + if (request.data_len && request.data_len < request.data_valid) > > + return -EINVAL; > > + > > + init_completion(&request_ctx.wait_vsp); > > + request_ctx.request = &request; > > + > > + /* > > + * Need to set rw to READ to have page table set up for passing to VSP. > > + * Setting it to WRITE will cause the page table entry not allocated > > + * as it's waiting on Copy-On-Write on next page fault. This doesn't > > + * work in this scenario because VSP wants all the pages to be present. > > + */ > > + ret = get_buffer_pages(READ, (void __user *) request.request_buffer, > > Does this need __force for sparse not to complain? I didn't run sparse for the patch. Will look into it. Thanks for the pointer. > > > + request.request_len, &request_pages, &request_start, > > + &request_num_pages); > > + if (ret) > > + goto get_user_page_failed; > > + > > + ret = get_buffer_pages(READ, (void __user *) > request.response_buffer, > > + request.response_len, &response_pages, &response_start, > > + &response_num_pages); > > + if (ret) > > + goto get_user_page_failed; > > + > > + if (request.data_len) { > > + ret = get_buffer_pages(READ, > > + (void __user *) request.data_buffer, request.data_len, > > + &data_pages, &data_start, &data_num_pages); > > + if (ret) > > + goto get_user_page_failed; > > + } > > + > > + total_num_pages = request_num_pages + response_num_pages + > > + data_num_pages; > > + if (total_num_pages > AZ_BLOB_MAX_PAGES) { > > + az_blob_dbg("number of DMA pages %lu buffer > exceeding %u\n", > > + total_num_pages, AZ_BLOB_MAX_PAGES); > > + ret = -EINVAL; > > + goto get_user_page_failed; > > + } > > + > > + /* Construct a VMBUS packet and send it over to VSP */ > > + desc_size = sizeof(struct vmbus_packet_mpb_array) + > > + sizeof(u64) * total_num_pages; > > + desc = kzalloc(desc_size, GFP_KERNEL); > > Smells like a call for struct_size(). Yes it's a good idea to use struct_size. Will change this. > > > + vsp_request = kzalloc(sizeof(*vsp_request), GFP_KERNEL); > > + if (!desc || !vsp_request) { > > + kfree(desc); > > + kfree(vsp_request); > > + ret = -ENOMEM; > > + goto get_user_page_failed; > > + } > > + > > + desc->range.offset = 0; > > + desc->range.len = total_num_pages * PAGE_SIZE; > > + pfn_array = desc->range.pfn_array; > > + page_idx = 0; > > + > > + if (request.data_len) { > > + fill_in_page_buffer(pfn_array, &page_idx, data_pages, > > + data_num_pages); > > + vsp_request->data_buffer_offset = data_start; > > + vsp_request->data_buffer_length = request.data_len; > > + vsp_request->data_buffer_valid = request.data_valid; > > + } > > + > > + fill_in_page_buffer(pfn_array, &page_idx, request_pages, > > + request_num_pages); > > + vsp_request->request_buffer_offset = request_start + > > + data_num_pages * PAGE_SIZE; > > + vsp_request->request_buffer_length = request.request_len; > > + > > + fill_in_page_buffer(pfn_array, &page_idx, response_pages, > > + response_num_pages); > > + vsp_request->response_buffer_offset = response_start + > > + (data_num_pages + request_num_pages) * PAGE_SIZE; > > + vsp_request->response_buffer_length = request.response_len; > > + > > + vsp_request->version = 0; > > + vsp_request->timeout_ms = request.timeout; > > + vsp_request->operation_type = AZ_BLOB_DRIVER_USER_REQUEST; > > + guid_copy(&vsp_request->transaction_id, &request.guid); > > + > > + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags); > > + list_add_tail(&request_ctx.list, &file_ctx->vsp_pending_requests); > > + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags); > > + > > + az_blob_dbg("sending request to VSP\n"); > > + az_blob_dbg("desc_size %u desc->range.len %u desc- > >range.offset %u\n", > > + desc_size, desc->range.len, desc->range.offset); > > + az_blob_dbg("vsp_request data_buffer_offset %u > data_buffer_length %u " > > + "data_buffer_valid %u request_buffer_offset %u " > > + "request_buffer_length %u response_buffer_offset %u " > > + "response_buffer_length %u\n", > > + vsp_request->data_buffer_offset, > > + vsp_request->data_buffer_length, > > + vsp_request->data_buffer_valid, > > + vsp_request->request_buffer_offset, > > + vsp_request->request_buffer_length, > > + vsp_request->response_buffer_offset, > > + vsp_request->response_buffer_length); > > + > > + ret = vmbus_sendpacket_mpb_desc(dev->device->channel, desc, > desc_size, > > + vsp_request, sizeof(*vsp_request), (u64) &request_ctx); > > + > > + kfree(desc); > > + kfree(vsp_request); > > + if (ret) > > + goto vmbus_send_failed; > > + > > + wait_for_completion(&request_ctx.wait_vsp); > > Provided this is ioctl, this should likely be interruptible. You don't want to > account to I/O load. The same likely for az_blob_fop_release. The reason for wait is that the memory for I/O is pinned and passed to the host for direct I/O. Now the host owns those memory, and we can't do anything to those memory until the host releases ownership. > > > + > > + /* > > + * At this point, the response is already written to request > > + * by VMBUS completion handler, copy them to user-mode buffers > > + * and return to user-mode > > + */ > > + if (copy_to_user(argp + > > + offsetof(struct az_blob_request_sync, > > + response.status), > > + &request.response.status, > > This is ugly, why don't you make argp the appropriate pointer instead of char > *? You'd need not do copy_to_user twice then, right? > > > + sizeof(request.response.status))) { > > + ret = -EFAULT; > > + goto vmbus_send_failed; > > + } > > + > > + if (copy_to_user(argp + > > + offsetof(struct az_blob_request_sync, > > + response.response_len), > > The same here. I'll fix the pointers. > > > + &request.response.response_len, > > + sizeof(request.response.response_len))) > > + ret = -EFAULT; > > + > > +vmbus_send_failed: > > + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags); > > + list_del(&request_ctx.list); > > + if (list_empty(&file_ctx->vsp_pending_requests)) > > + wake_up(&file_ctx->wait_vsp_pending); > > + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags); > > + > > +get_user_page_failed: > > + free_buffer_pages(request_num_pages, request_pages); > > + free_buffer_pages(response_num_pages, response_pages); > > + free_buffer_pages(data_num_pages, data_pages); > > + > > + return ret; > > +} > > This function is overly long. Care to split it (e.g. moving away the initialization > of the structs and the debug stuff)? This is true. I'm looking for ways to refactor it. > > > + > > +static long az_blob_fop_ioctl(struct file *filp, unsigned int cmd, > > + unsigned long arg) > > +{ > > + long ret = -EIO; > > EINVAL would be more appropriate. Good point. I'll fix it. > > > + > > + switch (cmd) { > > + case IOCTL_AZ_BLOB_DRIVER_USER_REQUEST: > > + if (_IOC_SIZE(cmd) != sizeof(struct az_blob_request_sync)) > > + return -EINVAL; > > How can that happen, provided the switch (cmd) and case? I see some other drivers checking it. It's probably not needed. Will remove it. > > > + ret = az_blob_ioctl_user_request(filp, arg); > > + break; > > Simply: > return az_blob_ioctl_user_request(filp, arg); > > > + > > + default: > > + az_blob_dbg("unrecognized IOCTL code %u\n", cmd); > > + } > > + > > + return ret; > > So return -EINVAL here directly now. > > > +} > ... > > +static int az_blob_connect_to_vsp(struct hv_device *device, u32 > > +ring_size) { > > + int ret; > ... > > + int rc; > > > Sometimes ret, sometimes rc, could you unify the two? Will fix this. > > > +static int __init az_blob_drv_init(void) { > > + int ret; > > + > > + ret = vmbus_driver_register(&az_blob_drv); > > + return ret; > > Simply: > return vmbus_driver_register(&az_blob_drv); > > regards, > -- > -- > js > suse labs Thank you! Long