Re: [Qemu-devel] [PATCH 09/19] Introduce event-tap.

Yoshiaki Tamura <tamura.yoshiaki@xxxxxxxxxxxxx> · Wed, 19 Jan 2011 22:04:39 +0900

2011/1/19 Kevin Wolf <kwolf@xxxxxxxxxx>:
> Am 19.01.2011 06:44, schrieb Yoshiaki Tamura:
>> event-tap controls when to start an FT transaction, and provides proxy
>> functions to be called from net/block devices.  During an FT transaction,
>> it queues up net/block requests, and flushes them when the transaction
>> gets completed.
>>
>> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@xxxxxxxxxxxxx>
>> Signed-off-by: OHMURA Kei <ohmura.kei@xxxxxxxxxxxxx>
>
> One general comment: On the first glance this seems to mix block and net
> (and some other things) arbitrarily instead of having a section for
> handling all block stuff, then network, etc.
>
> Is there a specific reason for the order in which you put the functions?
> If not, maybe reordering them might improve readability.

Thanks.  I'll rework that.

>
>> ---
>>  Makefile.target |    1 +
>>  event-tap.c     |  847 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  event-tap.h     |   42 +++
>>  qemu-tool.c     |   24 ++
>>  trace-events    |    9 +
>>  5 files changed, 923 insertions(+), 0 deletions(-)
>>  create mode 100644 event-tap.c
>>  create mode 100644 event-tap.h
>>
>> diff --git a/Makefile.target b/Makefile.target
>> index e15b1c4..f36cd75 100644
>> --- a/Makefile.target
>> +++ b/Makefile.target
>> @@ -199,6 +199,7 @@ obj-y += rwhandler.o
>>  obj-$(CONFIG_KVM) += kvm.o kvm-all.o
>>  obj-$(CONFIG_NO_KVM) += kvm-stub.o
>>  LIBS+=-lz
>> +obj-y += event-tap.o
>>
>>  QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
>>  QEMU_CFLAGS += $(VNC_SASL_CFLAGS)
>> diff --git a/event-tap.c b/event-tap.c
>> new file mode 100644
>> index 0000000..f492708
>> --- /dev/null
>> +++ b/event-tap.c
>
>> @@ -0,0 +1,847 @@
>> +/*
>> + * Event Tap functions for QEMU
>> + *
>> + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu-common.h"
>> +#include "qemu-error.h"
>> +#include "block.h"
>> +#include "block_int.h"
>> +#include "ioport.h"
>> +#include "osdep.h"
>> +#include "sysemu.h"
>> +#include "hw/hw.h"
>> +#include "net.h"
>> +#include "event-tap.h"
>> +#include "trace.h"
>> +
>> +enum EVENT_TAP_STATE {
>> +    EVENT_TAP_OFF,
>> +    EVENT_TAP_ON,
>> +    EVENT_TAP_FLUSH,
>> +    EVENT_TAP_LOAD,
>> +    EVENT_TAP_REPLAY,
>> +};
>> +
>> +static enum EVENT_TAP_STATE event_tap_state = EVENT_TAP_OFF;
>> +static BlockDriverAIOCB dummy_acb; /* we may need a pool for dummies */
>
> Indeed, bdrv_aio_cancel will segfault this way.
>
> If you use dummies instead of real ACBs the only way to correctly
> implement bdrv_aio_cancel is waiting for all in-flight AIOs
> (qemu_aio_flush).

So I need to insert a new event_tap function into bdrv_aio_cancel
to do that.

>
>> +typedef struct EventTapIOport {
>> +    uint32_t address;
>> +    uint32_t data;
>> +    int      index;
>> +} EventTapIOport;
>> +
>> +#define MMIO_BUF_SIZE 8
>> +
>> +typedef struct EventTapMMIO {
>> +    uint64_t address;
>> +    uint8_t  buf[MMIO_BUF_SIZE];
>> +    int      len;
>> +} EventTapMMIO;
>> +
>> +typedef struct EventTapNetReq {
>> +    char *device_name;
>> +    int iovcnt;
>> +    struct iovec *iov;
>> +    int vlan_id;
>> +    bool vlan_needed;
>> +    bool async;
>> +    NetPacketSent *sent_cb;
>> +} EventTapNetReq;
>> +
>> +#define MAX_BLOCK_REQUEST 32
>> +
>> +typedef struct EventTapBlkReq {
>> +    char *device_name;
>> +    int num_reqs;
>> +    int num_cbs;
>> +    bool is_flush;
>> +    BlockRequest reqs[MAX_BLOCK_REQUEST];
>> +    BlockDriverCompletionFunc *cb[MAX_BLOCK_REQUEST];
>> +    void *opaque[MAX_BLOCK_REQUEST];
>> +} EventTapBlkReq;
>> +
>> +#define EVENT_TAP_IOPORT (1 << 0)
>> +#define EVENT_TAP_MMIO   (1 << 1)
>> +#define EVENT_TAP_NET    (1 << 2)
>> +#define EVENT_TAP_BLK    (1 << 3)
>> +
>> +#define EVENT_TAP_TYPE_MASK (EVENT_TAP_NET - 1)
>> +
>> +typedef struct EventTapLog {
>> +    int mode;
>> +    union {
>> +        EventTapIOport ioport;
>> +        EventTapMMIO mmio;
>> +    };
>> +    union {
>> +        EventTapNetReq net_req;
>> +        EventTapBlkReq blk_req;
>> +    };
>> +    QTAILQ_ENTRY(EventTapLog) node;
>> +} EventTapLog;
>> +
>> +static EventTapLog *last_event_tap;
>> +
>> +static QTAILQ_HEAD(, EventTapLog) event_list;
>> +static QTAILQ_HEAD(, EventTapLog) event_pool;
>> +
>> +static int (*event_tap_cb)(void);
>> +static QEMUBH *event_tap_bh;
>> +static VMChangeStateEntry *vmstate;
>> +
>> +static void event_tap_bh_cb(void *p)
>> +{
>> +    if (event_tap_cb) {
>> +        event_tap_cb();
>> +    }
>> +
>> +    qemu_bh_delete(event_tap_bh);
>> +    event_tap_bh = NULL;
>> +}
>> +
>> +static void event_tap_schedule_bh(void)
>> +{
>> +    trace_event_tap_ignore_bh(!!event_tap_bh);
>> +
>> +    /* if bh is already set, we ignore it for now */
>> +    if (event_tap_bh) {
>> +        return;
>> +    }
>> +
>> +    event_tap_bh = qemu_bh_new(event_tap_bh_cb, NULL);
>> +    qemu_bh_schedule(event_tap_bh);
>> +
>> +    return ;
>> +}
>> +
>> +static void event_tap_alloc_net_req(EventTapNetReq *net_req,
>> +                                   VLANClientState *vc,
>> +                                   const struct iovec *iov, int iovcnt,
>> +                                   NetPacketSent *sent_cb, bool async)
>> +{
>> +    int i;
>> +
>> +    net_req->iovcnt = iovcnt;
>> +    net_req->async = async;
>> +    net_req->device_name = qemu_strdup(vc->name);
>> +    net_req->sent_cb = sent_cb;
>> +
>> +    if (vc->vlan) {
>> +        net_req->vlan_needed = 1;
>> +        net_req->vlan_id = vc->vlan->id;
>> +    } else {
>> +        net_req->vlan_needed = 0;
>> +    }
>> +
>> +    if (async) {
>> +        net_req->iov = (struct iovec *)iov;
>> +    } else {
>> +        net_req->iov = qemu_malloc(sizeof(struct iovec) * iovcnt);
>> +        for (i = 0; i < iovcnt; i++) {
>> +            net_req->iov[i].iov_base = qemu_malloc(iov[i].iov_len);
>> +            memcpy(net_req->iov[i].iov_base, iov[i].iov_base, iov[i].iov_len);
>> +            net_req->iov[i].iov_len = iov[i].iov_len;
>> +        }
>> +    }
>> +}
>> +
>> +static void event_tap_alloc_blk_req(EventTapBlkReq *blk_req,
>> +                                    BlockDriverState *bs, BlockRequest *reqs,
>> +                                    int num_reqs, BlockDriverCompletionFunc *cb,
>> +                                    void *opaque, bool is_flush)
>> +{
>> +    int i;
>> +
>> +    blk_req->num_reqs = num_reqs;
>> +    blk_req->num_cbs = num_reqs;
>> +    blk_req->device_name = qemu_strdup(bs->device_name);
>> +    blk_req->is_flush = is_flush;
>> +
>> +    for (i = 0; i < num_reqs; i++) {
>> +        blk_req->reqs[i].sector = reqs[i].sector;
>> +        blk_req->reqs[i].nb_sectors = reqs[i].nb_sectors;
>> +        blk_req->reqs[i].qiov = reqs[i].qiov;
>> +        blk_req->reqs[i].cb = cb;
>> +        blk_req->reqs[i].opaque = opaque;
>> +        blk_req->cb[i] = reqs[i].cb;
>> +        blk_req->opaque[i] = reqs[i].opaque;
>> +    }
>> +}
>> +
>> +static void *event_tap_alloc_log(void)
>> +{
>> +    EventTapLog *log;
>> +
>> +    if (QTAILQ_EMPTY(&event_pool)) {
>> +        log = qemu_mallocz(sizeof(EventTapLog));
>> +    } else {
>> +        log = QTAILQ_FIRST(&event_pool);
>> +        QTAILQ_REMOVE(&event_pool, log, node);
>> +    }
>> +
>> +    return log;
>> +}
>> +
>> +static void event_tap_free_log(EventTapLog *log)
>> +{
>> +    int i, mode = log->mode & ~EVENT_TAP_TYPE_MASK;
>> +
>> +    if (mode == EVENT_TAP_NET) {
>> +        EventTapNetReq *net_req = &log->net_req;
>> +
>> +        if (!net_req->async) {
>> +            for (i = 0; i < net_req->iovcnt; i++) {
>> +                qemu_free(net_req->iov[i].iov_base);
>> +            }
>> +            qemu_free(net_req->iov);
>> +        } else if (event_tap_state >= EVENT_TAP_LOAD) {
>> +            qemu_free(net_req->iov);
>> +        }
>> +
>> +        qemu_free(net_req->device_name);
>> +    } else if (mode == EVENT_TAP_BLK) {
>> +        EventTapBlkReq *blk_req = &log->blk_req;
>> +
>> +        if (event_tap_state >= EVENT_TAP_LOAD && !blk_req->is_flush) {
>> +            for (i = 0; i < blk_req->num_reqs; i++) {
>> +                qemu_iovec_destroy(blk_req->reqs[i].qiov);
>> +                qemu_free(blk_req->reqs[i].qiov);
>> +            }
>> +        }
>> +
>> +        qemu_free(blk_req->device_name);
>> +    }
>> +
>> +    log->mode = 0;
>> +
>> +    /* return the log to event_pool */
>> +    QTAILQ_INSERT_HEAD(&event_pool, log, node);
>> +}
>> +
>> +static void event_tap_free_pool(void)
>> +{
>> +    EventTapLog *log, *next;
>> +
>> +    QTAILQ_FOREACH_SAFE(log, &event_pool, node, next) {
>> +        QTAILQ_REMOVE(&event_pool, log, node);
>> +        qemu_free(log);
>> +    }
>> +}
>> +
>> +static void event_tap_blk_cb(void *opaque, int ret)
>> +{
>> +    EventTapLog *log = container_of(opaque, EventTapLog, blk_req);
>> +    EventTapBlkReq *blk_req = opaque;
>> +    int i;
>> +
>> +    blk_req->num_cbs--;
>> +
>> +    /* all outstanding requests are flushed */
>> +    if (blk_req->num_cbs == 0) {
>> +        for (i = 0; i < blk_req->num_reqs; i++) {
>> +            blk_req->cb[i](blk_req->opaque[i], ret);
>> +        }
>> +
>> +        event_tap_free_log(log);
>> +    }
>> +}
>> +
>> +static void event_tap_packet(VLANClientState *vc, const struct iovec *iov,
>> +                            int iovcnt, NetPacketSent *sent_cb, bool async)
>> +{
>> +    int empty;
>> +    EventTapLog *log = last_event_tap;
>> +
>> +    if (!log) {
>> +        trace_event_tap_no_event();
>> +        log = event_tap_alloc_log();
>> +    }
>> +
>> +    if (log->mode & ~EVENT_TAP_TYPE_MASK) {
>> +        trace_event_tap_already_used(log->mode & ~EVENT_TAP_TYPE_MASK);
>> +        return;
>> +    }
>> +
>> +    log->mode |= EVENT_TAP_NET;
>> +    event_tap_alloc_net_req(&log->net_req, vc, iov, iovcnt, sent_cb, async);
>> +
>> +    empty = QTAILQ_EMPTY(&event_list);
>> +    QTAILQ_INSERT_TAIL(&event_list, log, node);
>> +    last_event_tap = NULL;
>> +
>> +    if (empty) {
>> +        event_tap_schedule_bh();
>> +    }
>> +}
>> +
>> +static void event_tap_bdrv(BlockDriverState *bs, BlockRequest *reqs,
>> +                           int num_reqs, bool is_flush)
>> +{
>> +    EventTapLog *log = last_event_tap;
>> +    int empty;
>> +
>> +    if (!log) {
>> +        trace_event_tap_no_event();
>> +        log = event_tap_alloc_log();
>> +    }
>> +
>> +    if (log->mode & ~EVENT_TAP_TYPE_MASK) {
>> +        trace_event_tap_already_used(log->mode & ~EVENT_TAP_TYPE_MASK);
>> +        return;
>> +    }
>> +
>> +    log->mode |= EVENT_TAP_BLK;
>> +    event_tap_alloc_blk_req(&log->blk_req, bs, reqs, num_reqs,
>> +                            event_tap_blk_cb, &log->blk_req, is_flush);
>> +
>> +    empty = QTAILQ_EMPTY(&event_list);
>> +    QTAILQ_INSERT_TAIL(&event_list, log, node);
>> +    last_event_tap = NULL;
>> +
>> +    if (empty) {
>> +        event_tap_schedule_bh();
>> +    }
>> +}
>> +
>> +BlockDriverAIOCB *event_tap_bdrv_aio_writev(BlockDriverState *bs,
>> +                                            int64_t sector_num,
>> +                                            QEMUIOVector *iov,
>> +                                            int nb_sectors,
>> +                                            BlockDriverCompletionFunc *cb,
>> +                                            void *opaque)
>> +{
>> +    BlockRequest req;
>> +
>> +    assert(event_tap_state == EVENT_TAP_ON);
>> +
>> +    req.sector = sector_num;
>> +    req.nb_sectors = nb_sectors;
>> +    req.qiov = iov;
>> +    req.cb = cb;
>> +    req.opaque = opaque;
>> +    event_tap_bdrv(bs, &req, 1, 0);
>> +
>> +    /* return a dummy_acb pointer to prevent from failing */
>> +    return &dummy_acb;
>> +}
>> +
>> +BlockDriverAIOCB *event_tap_bdrv_aio_flush(BlockDriverState *bs,
>> +                                           BlockDriverCompletionFunc *cb,
>> +                                           void *opaque)
>> +{
>> +    BlockRequest req;
>> +
>> +    assert(event_tap_state == EVENT_TAP_ON);
>> +
>> +    memset(&req, 0, sizeof(req));
>> +    req.cb = cb;
>> +    req.opaque = opaque;
>> +    event_tap_bdrv(bs, &req, 1, 1);
>> +
>> +    return &dummy_acb;
>> +}
>> +
>> +void event_tap_send_packet(VLANClientState *vc, const uint8_t *buf, int size)
>> +{
>> +    struct iovec iov;
>> +
>> +    assert(event_tap_state == EVENT_TAP_ON);
>> +
>> +    iov.iov_base = (uint8_t *)buf;
>> +    iov.iov_len = size;
>> +    event_tap_packet(vc, &iov, 1, NULL, 0);
>> +
>> +    return;
>> +}
>> +ssize_t event_tap_sendv_packet_async(VLANClientState *vc,
>> +                                     const struct iovec *iov,
>> +                                     int iovcnt, NetPacketSent *sent_cb)
>> +{
>> +    assert(event_tap_state == EVENT_TAP_ON);
>> +    event_tap_packet(vc, iov, iovcnt, sent_cb, 1);
>> +    return 0;
>> +}
>> +
>> +int event_tap_register(int (*cb)(void))
>> +{
>> +    if (event_tap_state != EVENT_TAP_OFF) {
>> +        error_report("event-tap is already on");
>> +        return -EINVAL;
>> +    }
>> +
>> +    if (!cb || event_tap_cb) {
>> +        error_report("can't set event_tap_cb");
>> +        return -EINVAL;
>> +    }
>> +
>> +    event_tap_cb = cb;
>> +    event_tap_state = EVENT_TAP_ON;
>> +
>> +    return 0;
>> +}
>> +
>> +void event_tap_unregister(void)
>> +{
>> +    if (event_tap_state == EVENT_TAP_OFF) {
>> +        error_report("event-tap is already off");
>> +        return;
>> +    }
>> +
>> +    event_tap_state = EVENT_TAP_OFF;
>> +    event_tap_cb = NULL;
>> +
>> +    event_tap_flush();
>> +    event_tap_free_pool();
>> +}
>> +
>> +int event_tap_is_on(void)
>> +{
>> +    return (event_tap_state == EVENT_TAP_ON);
>> +}
>> +
>> +void event_tap_ioport(int index, uint32_t address, uint32_t data)
>> +{
>> +    if (event_tap_state != EVENT_TAP_ON) {
>> +        return;
>> +    }
>> +
>> +    if (!last_event_tap) {
>> +        last_event_tap = event_tap_alloc_log();
>> +    }
>> +
>> +    last_event_tap->mode = EVENT_TAP_IOPORT;
>> +    last_event_tap->ioport.index = index;
>> +    last_event_tap->ioport.address = address;
>> +    last_event_tap->ioport.data = data;
>> +}
>> +
>> +void event_tap_mmio(uint64_t address, uint8_t *buf, int len)
>> +{
>> +    if (event_tap_state != EVENT_TAP_ON || len > MMIO_BUF_SIZE) {
>> +        return;
>> +    }
>> +
>> +    if (!last_event_tap) {
>> +        last_event_tap = event_tap_alloc_log();
>> +    }
>> +
>> +    last_event_tap->mode = EVENT_TAP_MMIO;
>> +    last_event_tap->mmio.address = address;
>> +    last_event_tap->mmio.len = len;
>> +    memcpy(last_event_tap->mmio.buf, buf, len);
>> +}
>> +
>> +static void event_tap_net_flush(EventTapNetReq *net_req)
>> +{
>> +    VLANClientState *vc;
>> +    ssize_t len;
>> +
>> +    if (net_req->vlan_needed) {
>> +        vc = qemu_find_vlan_client_by_name(NULL, net_req->vlan_id,
>> +                                           net_req->device_name);
>> +    } else {
>> +        vc = qemu_find_netdev(net_req->device_name);
>> +    }
>> +
>> +    if (net_req->async) {
>> +        len = qemu_sendv_packet_async(vc, net_req->iov, net_req->iovcnt,
>> +                                      net_req->sent_cb);
>> +        if (len) {
>> +            net_req->sent_cb(vc, len);
>> +        } else {
>> +            /* packets are queued in the net layer */
>> +            trace_event_tap_append_packet();
>> +        }
>> +    } else {
>> +        qemu_send_packet(vc, net_req->iov[0].iov_base,
>> +                         net_req->iov[0].iov_len);
>> +    }
>> +}
>> +
>> +static void event_tap_blk_flush(EventTapBlkReq *blk_req)
>> +{
>> +    BlockDriverState *bs;
>> +
>> +    bs = bdrv_find(blk_req->device_name);
>
> Please store the BlockDriverState in blk_req. This code loops over all
> block devices and does a string comparison - and that for each request.
> You can also save the qemu_strdup() when creating the request.
>
> In the few places where you really need the device name (might be the
> case for load/save, I'm not sure), you can still get it from the
> BlockDriverState.

I would do so for the primary side.  Although we haven't
implemented it yet, we want to replay block requests from the block
layer on the secondary side, and we need the device name to restore
the BlockDriverState.

>
>> +
>> +    if (blk_req->is_flush) {
>> +        bdrv_aio_flush(bs, blk_req->reqs[0].cb, blk_req->reqs[0].opaque);
>
> You need to handle errors. If bdrv_aio_flush returns NULL, call the
> callback with -EIO.

I'll do so.

>
>> +        return;
>> +    }
>> +
>> +    bdrv_aio_writev(bs, blk_req->reqs[0].sector, blk_req->reqs[0].qiov,
>> +                    blk_req->reqs[0].nb_sectors, blk_req->reqs[0].cb,
>> +                    blk_req->reqs[0].opaque);
>
> Same here.
>
>> +    bdrv_flush(bs);
>
> This looks really strange. What is this supposed to do?
>
> One point is that you write it immediately after bdrv_aio_write, so you
> get an fsync for which you don't know if it includes the current write
> request or if it doesn't. Which data do you want to get flushed to the disk?

I was expecting to flush the aio request that was just initiated.
Am I misunderstanding the function?

> The other thing is that you introduce a bdrv_flush for each request,
> basically forcing everyone to something very similar to writethrough
> mode. I'm sure this will have a big impact on performance.

The reason is to avoid inversion of queued requests.  Although
processing one-by-one is heavy, wouldn't having requests flushed
to disk out of order break the disk image?

> Additionally, error handling is missing.

I looked at the code using bdrv_flush and realized that some callers
don't handle errors, but scsi-disk.c does.  Should every caller
handle errors, or does it depend on the usage?

>
> Kevin
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html