In this episode, Rusty tries to NAPI-ize the driver and discovers that virtio callbacks are a bad idea: NAPI needs to turn interrupts off and still be able to query for new incoming packets. Changes to core: 1) Back to "interrupt" model with get_inbuf()/get_outbuf() calls. 2) Clearer rules for locking: "in" calls cannot overlap each other, "out" calls cannot overlap each other, but an "in" call can overlap an "out" call. 3) Methods for suppressing/enabling "interrupt" calls. Changes to example net driver: 1) NAPI; locking is now correct (and there is none). Changes to example block driver: 1) Relay SCSI ioctls (particularly CDROMEJECT) for optional server support (VIRTIO_BLK_T_SCSI_CMD). 2) /dev/vb -> /dev/vd. 3) Barrier support. 4) Header w/ definitions can be included from userspace. Here's the inter-diff for the three: diff -u b/include/linux/virtio.h b/include/linux/virtio.h --- b/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -7,69 +7,109 @@ /** * virtio_device - description and routines to drive a virtual device. - * @lock: the lock to hold before calling any functions. * @dev: the underlying struct device. * @ops: the operations for this virtual device. + * @driver_ops: set by the driver for callbacks. * @priv: private pointer for the driver to use. */ struct virtio_device { - spinlock_t lock; struct device *dev; struct virtio_ops *ops; + struct virtio_driver_ops *driver_ops; void *priv; }; /** + * virtio_driver_ops - driver callbacks for a virtual device. + * @in: inbufs have been completed. + * Usually called from an interrupt handler. + * Return false to suppress further inbuf callbacks. + * @out: outbufs have been completed. + * Usually called from an interrupt handler. + * Return false to suppress further outbuf callbacks.
+ */ +struct virtio_driver_ops { + bool (*in)(struct virtio_device *dev); + bool (*out)(struct virtio_device *dev); +}; + +enum virtio_dir { + VIRTIO_IN = 0x1, + VIRTIO_OUT = 0x2, +}; + +/** * virtio_ops - virtio abstraction layer * @add_outbuf: prepare to send data to the other end: * vdev: the virtio_device * sg: the description of the buffer(s). * num: the size of the sg array. - * cb: the function to call once the outbuf is finished & detached. - * data: the token to hand to the cb function. - * Returns a unique id or an error. Note that the callback will be - * called with the lock held, and possibly in an interrupt handler. + * data: the token returned by the get_outbuf function. + * Returns a unique id or an error. * @add_inbuf: prepare to receive data from the other end: * vdev: the virtio_device * sg: the description of the buffer(s). * num: the size of the sg array. - * cb: the function to call once the inbuf is finished & detached. - * data: the token to hand to the cb function. - * Returns a unique id or an error (eg. -ENOSPC). Note that the - * callback will be called with the lock held, and possibly in an - * interrupt handler. - * @sync: update after add_inbuf/add_outbuf + * data: the token returned by the get_inbuf function. + * Returns a unique id or an error (eg. -ENOSPC). + * @sync: update after add_inbuf and/or add_outbuf * vdev: the virtio_device we're talking about. + * inout: VIRTIO_IN and/or VIRTIO_OUT * After one or more add_inbuf/add_outbuf calls, invoke this to kick * the virtio layer. + * @get_outbuf: get the next used outbuf. + * vdev: the virtio_device we're talking about. + * len: the length written into the outbuf + * Returns NULL or the "data" token handed to add_outbuf (which has been + * detached). + * @get_inbuf: get the next used inbuf. + * vdev: the virtio_device we're talking about. + * len: the length read from the inbuf + * Returns NULL or the "data" token handed to add_inbuf (which has been + * detached). 
* @detach_outbuf: make sure sent sg can no longer be read. * vdev: the virtio_device we're talking about. * id: the id returned from add_outbuf. - * This is not necessary (or valid!) if the outbuf callback has - * already fired. + * This is usually used for shutdown. Don't try to detach twice. * @detach_inbuf: make sure sent sg can no longer be written to. * vdev: the virtio_device we're talking about. * id: the id returned from add_inbuf. - * This is not necessary (or valid!) if the outbuf callback has - * already fired. + * This is usually used for shutdown. Don't try to detach twice. + * @restart_in: restart calls to driver_ops->in after it returned false. + * vdev: the virtio_device we're talking about. + * This returns "false" (and doesn't re-enable) if there are pending + * inbufs, to avoid a race. + * @restart_out: restart calls to driver_ops->out after it returned false. + * vdev: the virtio_device we're talking about. + * This returns "false" (and doesn't re-enable) if there are pending + * outbufs, to avoid a race. + * + * Locking rules are straightforward: the driver is responsible for + * locking. Outbuf operations can be called in parallel to inbuf + * operations, but no two outbuf operations nor two inbuf operations + * may be invoked simultaneously. + * + * All operations can be called in any context. 
*/ struct virtio_ops { unsigned long (*add_outbuf)(struct virtio_device *vdev, const struct scatterlist sg[], unsigned int num, - void (*cb)(struct virtio_device *vdev, - void *data, unsigned len), void *data); unsigned long (*add_inbuf)(struct virtio_device *vdev, struct scatterlist sg[], unsigned int num, - void (*cb)(struct virtio_device *vdev, - void *data, unsigned len), void *data); - void (*sync)(struct virtio_device *vdev); + void (*sync)(struct virtio_device *vdev, enum virtio_dir inout); + + void *(*get_outbuf)(struct virtio_device *vdev, unsigned int *len); + void *(*get_inbuf)(struct virtio_device *vdev, unsigned int *len); void (*detach_outbuf)(struct virtio_device *vdev, unsigned long id); void (*detach_inbuf)(struct virtio_device *vdev, unsigned long id); + + bool (*restart_in)(struct virtio_device *vdev); + bool (*restart_out)(struct virtio_device *vdev); }; #endif /* _LINUX_VIRTIO_H */ diff -u b/drivers/net/virtio_net.c b/drivers/net/virtio_net.c --- b/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -33,29 +33,21 @@ struct virtio_device *vdev; struct net_device *ndev; + /* Number of input buffers, and max we've ever had. */ + unsigned int num, max; + /* Receive & send queues. */ struct sk_buff_head recv; struct sk_buff_head send; - - /* Transmitted packets waiting to be freed */ - struct sk_buff_head free; }; -static void skb_xmit_done(struct virtio_device *vdev, void *_skb, unsigned len) +static bool skb_xmit_done(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; - struct sk_buff *skb = _skb; - - assert_spin_locked(&vdev->lock); - - __skb_unlink(skb, &vi->send); - vi->ndev->stats.tx_bytes += len; - vi->ndev->stats.tx_packets++; - __skb_queue_head(&vi->free, skb); - pr_debug("Sent skb %p\n", skb); /* In case we were waiting for output buffers. 
*/ netif_wake_queue(vi->ndev); + return true; } static void receive_skb(struct net_device *dev, struct sk_buff *skb, @@ -78,48 +70,90 @@ netif_rx(skb); } -static void skb_recv_done(struct virtio_device *, void *, unsigned); -static int try_fill_recv(struct virtnet_info *vi) +static void try_fill_recv(struct virtnet_info *vi) { struct sk_buff *skb; struct scatterlist sg[MAX_SKB_FRAGS]; - unsigned long num, id; - - assert_spin_locked(&vi->vdev->lock); + unsigned long sgnum, id; - skb = netdev_alloc_skb(vi->ndev, MAX_PACKET_LEN); - if (unlikely(!skb)) - return -ENOMEM; + for (;;) { + skb = netdev_alloc_skb(vi->ndev, MAX_PACKET_LEN); + if (unlikely(!skb)) + break; + + skb_put(skb, MAX_PACKET_LEN); + sgnum = skb_to_sgvec(skb, sg, 0, skb->len); + skb_queue_head(&vi->recv, skb); + + id = vi->vdev->ops->add_inbuf(vi->vdev, sg, sgnum, skb); + if (IS_ERR_VALUE(id)) { + skb_unlink(skb, &vi->recv); + kfree_skb(skb); + break; + } + vi->num++; + } + if (unlikely(vi->num > vi->max)) + vi->max = vi->num; + vi->vdev->ops->sync(vi->vdev, VIRTIO_IN); +} - skb_put(skb, MAX_PACKET_LEN); - num = skb_to_sgvec(skb, sg, 0, skb->len); - skb_queue_head(&vi->recv, skb); +static bool skb_recv_done(struct virtio_device *vdev) +{ + struct virtnet_info *vi = vdev->priv; - id = vi->vdev->ops->add_inbuf(vi->vdev, sg, num, skb_recv_done, skb); - if (IS_ERR_VALUE(id)) { - skb_unlink(skb, &vi->recv); - kfree_skb(skb); - return id; - } - return 0; + netif_rx_schedule(vi->ndev); + /* Suppress further interrupts. 
*/ + return false; } -static void skb_recv_done(struct virtio_device *vdev, void *_skb, unsigned len) +static int virtnet_poll(struct net_device *dev, int *budget) { - struct virtnet_info *vi = vdev->priv; - struct sk_buff *skb = _skb; + struct virtnet_info *vi = netdev_priv(dev); + struct sk_buff *skb = NULL; + unsigned int len, received = 0; - assert_spin_locked(&vdev->lock); - __skb_unlink(skb, &vi->recv); - receive_skb(vi->ndev, skb, len); - try_fill_recv(vi); +again: + while (received < dev->quota && + (skb = vi->vdev->ops->get_inbuf(vi->vdev, &len)) != NULL) { + __skb_unlink(skb, &vi->recv); + receive_skb(vi->ndev, skb, len); + vi->num--; + received++; + } + + dev->quota -= received; + *budget -= received; + + /* FIXME: If we oom and completely run out of inbufs, we need + * to start a timer trying to fill more. */ + if (vi->num < vi->max / 2) + try_fill_recv(vi); + + /* Still more work to do? */ + if (skb) + return 1; /* not done */ + + netif_rx_complete(dev); + if (unlikely(!vi->vdev->ops->restart_in(vi->vdev)) + && netif_rx_reschedule(dev, received)) + goto again; + + return 0; } -static void free_old_skbs(struct sk_buff_head *free) +static void free_old_xmit_skbs(struct virtnet_info *vi) { struct sk_buff *skb; - while ((skb = __skb_dequeue(free)) != NULL) + unsigned int len; + + while ((skb = vi->vdev->ops->get_outbuf(vi->vdev, &len)) != NULL) { + pr_debug("Sent skb %p\n", skb); + __skb_unlink(skb, &vi->send); + vi->ndev->stats.tx_bytes += len; + vi->ndev->stats.tx_packets++; kfree_skb(skb); + } } static int start_xmit(struct sk_buff *skb, struct net_device *dev) @@ -128,19 +162,16 @@ unsigned long num, id; struct scatterlist sg[MAX_SKB_FRAGS]; const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; - unsigned long flags; pr_debug("%s: xmit %p %02x:%02x:%02x:%02x:%02x:%02x\n", dev->name, skb, dest[0], dest[1], dest[2], dest[3], dest[4], dest[5]); - spin_lock_irqsave(&vi->vdev->lock, flags); - /* Free any transmitted packets: not supposed to do 
it in interrupt */ - free_old_skbs(&vi->free); + free_old_xmit_skbs(vi); num = skb_to_sgvec(skb, sg, 0, skb->len); __skb_queue_head(&vi->send, skb); - id = vi->vdev->ops->add_outbuf(vi->vdev, sg, num, skb_xmit_done, skb); + id = vi->vdev->ops->add_outbuf(vi->vdev, sg, num, skb); if (IS_ERR_VALUE(id)) { pr_debug("%s: virtio not prepared to send\n", dev->name); skb_unlink(skb, &vi->send); @@ -148,8 +179,7 @@ return NETDEV_TX_BUSY; } SKB_ID(skb) = id; - vi->vdev->ops->sync(vi->vdev); - spin_unlock_irqrestore(&vi->vdev->lock, flags); + vi->vdev->ops->sync(vi->vdev, VIRTIO_OUT); return 0; } @@ -157,16 +187,12 @@ static int virtnet_open(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); - int i, err; - spin_lock_irq(&vi->vdev->lock); - for (i = 0; (err = try_fill_recv(vi)) == 0; i++); - vi->vdev->ops->sync(vi->vdev); - spin_unlock_irq(&vi->vdev->lock); + try_fill_recv(vi); /* If we didn't even get one input buffer, we're useless. */ - if (i == 0) - return err; + if (vi->num == 0) + return -ENOMEM; return 0; } @@ -176,20 +202,26 @@ struct virtnet_info *vi = netdev_priv(dev); struct sk_buff *skb; - spin_lock_irq(&vi->vdev->lock); + /* networking core has neutered skb_xmit_done/skb_recv_done, so don't + * worry about races vs. get_buf(). 
*/ while ((skb = __skb_dequeue(&vi->recv)) != NULL) { vi->vdev->ops->detach_inbuf(vi->vdev, SKB_ID(skb)); kfree_skb(skb); + vi->num--; } while ((skb = __skb_dequeue(&vi->send)) != NULL) { vi->vdev->ops->detach_outbuf(vi->vdev, SKB_ID(skb)); kfree_skb(skb); } - free_old_skbs(&vi->free); - spin_unlock_irq(&vi->vdev->lock); + BUG_ON(vi->num != 0); return 0; } +static struct virtio_driver_ops virtnet_ops = { + .in = skb_recv_done, + .out = skb_xmit_done, +}; + struct net_device *virtnet_probe(struct virtio_device *vdev, const u8 mac[ETH_ALEN]) { @@ -207,16 +239,18 @@ memcpy(dev->dev_addr, mac, ETH_ALEN); dev->open = virtnet_open; dev->stop = virtnet_close; + dev->poll = virtnet_poll; dev->hard_start_xmit = start_xmit; + dev->weight = 16; SET_NETDEV_DEV(dev, vdev->dev); vi = netdev_priv(dev); vi->vdev = vdev; vi->ndev = dev; vdev->priv = vi; + vdev->driver_ops = &virtnet_ops; skb_queue_head_init(&vi->recv); skb_queue_head_init(&vi->send); - skb_queue_head_init(&vi->free); err = register_netdev(dev); if (err) { diff -u b/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c --- b/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1,4 +1,4 @@ -//#define DEBUG +#define DEBUG #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/hdreg.h> @@ -8,6 +8,8 @@ static unsigned char virtblk_index = 'a'; struct virtio_blk { + spinlock_t lock; + struct virtio_device *vdev; /* The disk structure for the kernel. */ @@ -19,7 +21,7 @@ mempool_t *pool; /* Scatterlist: can be too big for stack. */ - struct scatterlist sg[1+MAX_PHYS_SEGMENTS]; + struct scatterlist sg[2+MAX_PHYS_SEGMENTS]; }; struct virtblk_req @@ -28,68 +30,94 @@ struct request *req; unsigned long out_id; bool out_done, in_done; - bool failed; + int uptodate; struct virtio_blk_outhdr out_hdr; struct virtio_blk_inhdr in_hdr; }; -/* Jens gave me this nice helper to end all chunks of a request. 
*/ -static void end_dequeued_request(struct request *req, int uptodate) +static void end_dequeued_request(struct request *req, + request_queue_t *q, int uptodate) { - if (end_that_request_first(req, uptodate, req->hard_nr_sectors)) + /* And so the insanity of the block layer infects us here. */ + int nsectors = req->hard_nr_sectors; + + if (blk_pc_request(req)) { + nsectors = (req->data_len + 511) >> 9; + if (!nsectors) + nsectors = 1; + printk("uptodate = %i\n", uptodate); + } + if (end_that_request_first(req, uptodate, nsectors)) BUG(); add_disk_randomness(req->rq_disk); end_that_request_last(req, uptodate); } -static void finish(struct virtio_blk *vblk, struct virtblk_req *vbr) +static bool finish(struct virtio_blk *vblk, struct virtblk_req *vbr) { - end_dequeued_request(vbr->req, !vbr->failed); + if (!vbr->in_done || !vbr->out_done) + return false; + end_dequeued_request(vbr->req, vblk->disk->queue, vbr->uptodate); list_del(&vbr->list); mempool_free(vbr, vblk->pool); - /* In case queue is stopped waiting for more buffers. */ - blk_start_queue(vblk->disk->queue); + return true; } /* We make sure they finished both the input and output buffers: otherwise * they might still have read access after we free them. */ -static void blk_out_done(struct virtio_device *vdev, void *_vbr, unsigned len) +static bool blk_out_done(struct virtio_device *vdev) { - struct virtblk_req *vbr = _vbr; struct virtio_blk *vblk = vdev->priv; + struct virtblk_req *vbr; + unsigned int len, finished = 0; + unsigned long flags; - assert_spin_locked(&vblk->vdev->lock); - - BUG_ON(vbr->out_done); - vbr->out_done = true; - if (vbr->in_done) - finish(vblk, vbr); + spin_lock_irqsave(&vblk->lock, flags); + while ((vbr = vdev->ops->get_outbuf(vdev, &len)) != NULL) { + BUG_ON(vbr->out_done); + vbr->out_done = true; + finished += finish(vblk, vbr); + } + /* In case queue is stopped waiting for more buffers. 
*/ + if (finished) + blk_start_queue(vblk->disk->queue); + spin_unlock_irqrestore(&vblk->lock, flags); + return true; } -static void blk_in_done(struct virtio_device *vdev, void *_vbr, unsigned len) +static bool blk_in_done(struct virtio_device *vdev) { - struct virtblk_req *vbr = _vbr; struct virtio_blk *vblk = vdev->priv; - unsigned long expected_len; + struct virtblk_req *vbr; + unsigned int len, finished = 0; + unsigned long flags; + + spin_lock_irqsave(&vblk->lock, flags); - assert_spin_locked(&vblk->vdev->lock); + while ((vbr = vdev->ops->get_inbuf(vdev, &len)) != NULL) { + BUG_ON(vbr->in_done); - expected_len = sizeof(vbr->in_hdr); - if (vbr->out_hdr.type == READ) - expected_len += vbr->req->hard_nr_sectors*512; - - if (unlikely(len != expected_len)) { - dev_err(vblk->vdev->dev, "short reply %u not %lu", - len, expected_len); - vbr->failed = true; - } else if (unlikely(vbr->in_hdr.status != 1)) { - vbr->failed = true; + switch (vbr->in_hdr.status) { + case VIRTIO_BLK_S_OK: + vbr->uptodate = 1; + break; + case VIRTIO_BLK_S_UNSUPP: + printk("Request was unsupported\n"); + vbr->uptodate = -ENOTTY; + break; + default: + vbr->uptodate = 0; + break; + } + vbr->in_done = true; + finished += finish(vblk, vbr); } - BUG_ON(vbr->in_done); - vbr->in_done = true; - if (vbr->out_done) - finish(vblk, vbr); + /* In case queue is stopped waiting for more buffers. */ + if (finished) + blk_start_queue(vblk->disk->queue); + spin_unlock_irqrestore(&vblk->lock, flags); + return true; } static bool do_write(request_queue_t *q, struct virtio_blk *vblk, @@ -97,12 +125,14 @@ { unsigned long num; + vbr->out_hdr.type |= VIRTIO_BLK_T_WRITE; + /* Set up for reply. 
*/ vblk->sg[0].page = virt_to_page(&vbr->in_hdr); vblk->sg[0].offset = offset_in_page(&vbr->in_hdr); vblk->sg[0].length = sizeof(vbr->in_hdr); vbr->out_hdr.id = vblk->vdev->ops->add_inbuf(vblk->vdev, vblk->sg, 1, - blk_in_done, vbr); + vbr); if (IS_ERR_VALUE(vbr->out_hdr.id)) goto full; @@ -112,15 +142,13 @@ vblk->sg[0].length = sizeof(vbr->out_hdr); num = blk_rq_map_sg(q, vbr->req, vblk->sg+1); - vbr->out_done = vbr->in_done = false; - vbr->failed = false; vbr->out_id = vblk->vdev->ops->add_outbuf(vblk->vdev, vblk->sg, 1+num, - blk_out_done, vbr); + vbr); if (IS_ERR_VALUE(vbr->out_id)) goto detach_inbuf_full; pr_debug("Write: %p in=%lu out=%lu\n", vbr, - vbr->out_hdr.id, vbr->out_id); + (long)vbr->out_hdr.id, (long)vbr->out_id); list_add_tail(&vbr->list, &vblk->reqs); return true; @@ -135,13 +163,15 @@ { unsigned long num; + vbr->out_hdr.type |= VIRTIO_BLK_T_READ; + /* Set up for reply. */ vblk->sg[0].page = virt_to_page(&vbr->in_hdr); vblk->sg[0].offset = offset_in_page(&vbr->in_hdr); vblk->sg[0].length = sizeof(vbr->in_hdr); num = blk_rq_map_sg(q, vbr->req, vblk->sg+1); vbr->out_hdr.id = vblk->vdev->ops->add_inbuf(vblk->vdev, vblk->sg, - 1+num, blk_in_done, vbr); + 1+num, vbr); if (IS_ERR_VALUE(vbr->out_hdr.id)) goto full; @@ -149,15 +179,53 @@ vblk->sg[0].offset = offset_in_page(&vbr->out_hdr); vblk->sg[0].length = sizeof(vbr->out_hdr); - vbr->out_done = vbr->in_done = false; - vbr->failed = false; vbr->out_id = vblk->vdev->ops->add_outbuf(vblk->vdev, vblk->sg, 1, - blk_out_done, vbr); + vbr); if (IS_ERR_VALUE(vbr->out_id)) goto detach_inbuf_full; pr_debug("Read: %p in=%lu out=%lu\n", vbr, - vbr->out_hdr.id, vbr->out_id); + (long)vbr->out_hdr.id, (long)vbr->out_id); + list_add_tail(&vbr->list, &vblk->reqs); + return true; + +detach_inbuf_full: + vblk->vdev->ops->detach_inbuf(vblk->vdev, vbr->out_hdr.id); +full: + return false; +} + +static bool do_scsi(request_queue_t *q, struct virtio_blk *vblk, + struct virtblk_req *vbr) +{ + unsigned long num; + + 
vbr->out_hdr.type |= VIRTIO_BLK_T_SCSI_CMD; + + /* Set up for reply. */ + vblk->sg[0].page = virt_to_page(&vbr->in_hdr); + vblk->sg[0].offset = offset_in_page(&vbr->in_hdr); + vblk->sg[0].length = sizeof(vbr->in_hdr); + vbr->out_hdr.id = vblk->vdev->ops->add_inbuf(vblk->vdev, vblk->sg, 1, + vbr); + if (IS_ERR_VALUE(vbr->out_hdr.id)) + goto full; + + vblk->sg[0].page = virt_to_page(&vbr->out_hdr); + vblk->sg[0].offset = offset_in_page(&vbr->out_hdr); + vblk->sg[0].length = sizeof(vbr->out_hdr); + vblk->sg[1].page = virt_to_page(vbr->req->cmd); + vblk->sg[1].offset = offset_in_page(vbr->req->cmd); + vblk->sg[1].length = vbr->req->cmd_len; + + num = blk_rq_map_sg(q, vbr->req, vblk->sg+1); + vbr->out_id = vblk->vdev->ops->add_outbuf(vblk->vdev, vblk->sg, 2+num, + vbr); + if (IS_ERR_VALUE(vbr->out_id)) + goto detach_inbuf_full; + + pr_debug("Scsi: %p in=%lu out=%lu\n", vbr, + (long)vbr->out_hdr.id, (long)vbr->out_id); list_add_tail(&vbr->list, &vblk->reqs); return true; @@ -176,37 +244,38 @@ while ((req = elv_next_request(q)) != NULL) { vblk = req->rq_disk->private_data; - /* FIXME: handle these iff capable. 
*/ - if (!blk_fs_request(req)) { - pr_debug("Got non-command 0x%08x\n", req->cmd_type); - req->errors++; - blkdev_dequeue_request(req); - end_dequeued_request(req, 0); - continue; - } - vbr = mempool_alloc(vblk->pool, GFP_ATOMIC); if (!vbr) goto stop; BUG_ON(req->nr_phys_segments > ARRAY_SIZE(vblk->sg)); vbr->req = req; - vbr->out_hdr.type = rq_data_dir(req); + /* Actual type gets or'ed in do_scsi/do_write/do_read */ + vbr->out_hdr.type = blk_barrier_rq(req)?VIRTIO_BLK_T_BARRIER:0; vbr->out_hdr.sector = req->sector; + vbr->out_hdr.ioprio = req->ioprio; + vbr->out_done = vbr->in_done = false; - if (rq_data_dir(req) == WRITE) { - if (!do_write(q, vblk, vbr)) - goto stop; - } else { - if (!do_read(q, vblk, vbr)) + if (blk_pc_request(req)) { + if (!do_scsi(q, vblk, vbr)) goto stop; - } + } else if (blk_fs_request(req)) { + if (rq_data_dir(req) == WRITE) { + if (!do_write(q, vblk, vbr)) + goto stop; + } else { + if (!do_read(q, vblk, vbr)) + goto stop; + } + } else + /* We don't put anything else in the queue. 
*/ + BUG(); blkdev_dequeue_request(req); } sync: if (vblk) - vblk->vdev->ops->sync(vblk->vdev); + vblk->vdev->ops->sync(vblk->vdev, VIRTIO_IN|VIRTIO_OUT); return; stop: @@ -216,7 +285,21 @@ goto sync; } +static int virtblk_ioctl(struct inode *inode, struct file *filp, + unsigned cmd, unsigned long data) +{ + return scsi_cmd_ioctl(filp, inode->i_bdev->bd_disk, cmd, + (void __user *)data); +} + +static struct virtio_driver_ops virtblk_ops = { + .in = blk_in_done, + .out = blk_out_done, +}; + + static struct block_device_operations virtblk_fops = { + .ioctl = virtblk_ioctl, .owner = THIS_MODULE, }; @@ -232,8 +315,10 @@ } INIT_LIST_HEAD(&vblk->reqs); + spin_lock_init(&vblk->lock); vblk->vdev = vdev; vdev->priv = vblk; + vdev->driver_ops = &virtblk_ops; vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req)); if (!vblk->pool) { @@ -254,19 +339,20 @@ goto out_unregister_blkdev; } - vblk->disk->queue = blk_init_queue(do_virtblk_request, - &vblk->vdev->lock); + vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); if (!vblk->disk->queue) { err = -ENOMEM; goto out_put_disk; } - sprintf(vblk->disk->disk_name, "vb%c", virtblk_index++); + sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++); vblk->disk->major = major; vblk->disk->first_minor = 0; vblk->disk->private_data = vblk; vblk->disk->fops = &virtblk_fops; + blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL); + /* Caller can do blk_queue_max_hw_segments(), set_capacity() * etc then add_disk(). */ return vblk->disk; diff -u b/include/linux/virtio_blk.h b/include/linux/virtio_blk.h --- b/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -3,26 +3,37 @@ #include <linux/types.h> -struct gendisk; -struct virtio_device; -struct hd_geometry; + +#define VIRTIO_BLK_T_READ 0 +#define VIRTIO_BLK_T_WRITE 1 +#define VIRTIO_BLK_T_SCSI_CMD 3 +#define VIRTIO_BLK_T_BARRIER 0x80000000 /* Barrier before this op. */ /* This is the first element of the scatter-gather list. 
*/ struct virtio_blk_outhdr { - /* 0 == read, 1 == write */ - u32 type; + /* VIRTIO_BLK_T* */ + __u32 type; + /* io priority. */ + __u32 ioprio; /* Sector (ie. 512 byte offset) */ - unsigned long sector; + __u64 sector; /* Where to put reply. */ - unsigned long id; + __u64 id; }; +#define VIRTIO_BLK_S_OK 0 +#define VIRTIO_BLK_S_IOERR 1 +#define VIRTIO_BLK_S_UNSUPP 2 + struct virtio_blk_inhdr { - /* 1 = OK, 0 = not ok. */ - unsigned long status; + unsigned char status; }; +#ifdef __KERNEL__ +struct gendisk; +struct virtio_device; + struct gendisk *virtblk_probe(struct virtio_device *vdev); void virtblk_remove(struct gendisk *disk); - +#endif /* __KERNEL__ */ #endif /* _LINUX_VIRTIO_BLK_H */ only in patch2: unchanged: --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -341,6 +341,7 @@ unifdef-y += utsname.h unifdef-y += utsname.h unifdef-y += videodev2.h unifdef-y += videodev.h +unifdef-y += virtio_blk.h unifdef-y += wait.h unifdef-y += wanrouter.h unifdef-y += watchdog.h _______________________________________________ Virtualization mailing list Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/virtualization