Re: [PATCH v8 3/4] block: add block timer and throttling algorithm

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sat, Sep 24, 2011 at 12:19 AM, Kevin Wolf <kwolf@xxxxxxxxxx> wrote:
> Am 08.09.2011 12:11, schrieb Zhi Yong Wu:
>> Note:
>>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
>>      2.) When "dd" command is issued in guest, if its option bs is set to a large value such as "bs=1024K", the result speed will slightly bigger than the limits.
>>
>> For these problems, if you have nice thought, pls let us know.:)
>>
>> Signed-off-by: Zhi Yong Wu <wuzhy@xxxxxxxxxxxxxxxxxx>
>> ---
>>  block.c |  259 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>  block.h |    1 -
>>  2 files changed, 248 insertions(+), 12 deletions(-)
>
> One general comment: What about synchronous and/or coroutine I/O
> operations? Do you think they are just not important enough to consider
> here or were they forgotten?
For sync ops, we assume that it will be converse into async mode at
some point of future, right?
For coroutine I/O, it is introduced in image driver layer, and behind
bdrv_aio_readv/writev. I think that we need not consider them, right?

>
> Also, do I understand correctly that you're always submitting the whole
Right, when the block timer fire, it will flush whole request queue.
> queue at once? Does this effectively enforce the limit all the time or
> will it lead to some peaks and then no requests at all for a while until
In fact, it only try to submit those enqueued request one by one. If
fail to pass the limit, this request will be enqueued again.
> the average is right again?
Yeah, it is possible. Do you better idea?
>
> Maybe some documentation on how it all works from a high level
> perspective would be helpful.
>
>> diff --git a/block.c b/block.c
>> index cd75183..c08fde8 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -30,6 +30,9 @@
>>  #include "qemu-objects.h"
>>  #include "qemu-coroutine.h"
>>
>> +#include "qemu-timer.h"
>> +#include "block/blk-queue.h"
>> +
>>  #ifdef CONFIG_BSD
>>  #include <sys/types.h>
>>  #include <sys/stat.h>
>> @@ -72,6 +75,13 @@ static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
>>                                           QEMUIOVector *iov);
>>  static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
>>
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +        double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, int64_t *wait);
>> +
>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>
>> @@ -745,6 +755,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>      }
>>
>> +    /* throttling disk I/O limits */
>> +    if (bs->io_limits_enabled) {
>> +        bdrv_io_limits_enable(bs);
>> +    }
>> +
>>      return 0;
>>
>>  unlink_and_fail:
>> @@ -783,6 +798,18 @@ void bdrv_close(BlockDriverState *bs)
>>          if (bs->change_cb)
>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>      }
>> +
>> +    /* throttling disk I/O limits */
>> +    if (bs->block_queue) {
>> +        qemu_del_block_queue(bs->block_queue);
>> +        bs->block_queue = NULL;
>> +    }
>> +
>> +    if (bs->block_timer) {
>> +        qemu_del_timer(bs->block_timer);
>> +        qemu_free_timer(bs->block_timer);
>> +        bs->block_timer = NULL;
>> +    }
>
> Why not io_limits_disable() instead of copying the code here?
Good point, thanks.
>
>>  }
>>
>>  void bdrv_close_all(void)
>> @@ -2341,16 +2368,48 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>>                                   BlockDriverCompletionFunc *cb, void *opaque)
>>  {
>>      BlockDriver *drv = bs->drv;
>> -
>> +    BlockDriverAIOCB *ret;
>> +    int64_t wait_time = -1;
>> +printf("sector_num=%ld, nb_sectors=%d\n", sector_num, nb_sectors);
>
> Debugging leftover (more of them follow, won't comment on each one)
Removed.
>
>>      trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>>
>> -    if (!drv)
>> -        return NULL;
>> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
>> +    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
>>          return NULL;
>> +    }
>
> This part is unrelated.
Have changed it to original.
>
>> +
>> +    /* throttling disk read I/O */
>> +    if (bs->io_limits_enabled) {
>> +        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
>> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
>> +                           sector_num, qiov, nb_sectors, cb, opaque);
>> +            printf("wait_time=%ld\n", wait_time);
>> +            if (wait_time != -1) {
>> +                printf("reset block timer\n");
>> +                qemu_mod_timer(bs->block_timer,
>> +                               wait_time + qemu_get_clock_ns(vm_clock));
>> +            }
>> +
>> +            if (ret) {
>> +                printf("ori ret is not null\n");
>> +            } else {
>> +                printf("ori ret is null\n");
>> +            }
>> +
>> +            return ret;
>> +        }
>> +    }
>>
>> -    return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
>> +    ret =  drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
>>                                 cb, opaque);
>> +    if (ret) {
>> +        if (bs->io_limits_enabled) {
>> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
>> +                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> +            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
>> +        }
>
> I wonder if you can't reuse bs->nr_bytes/nr_ops instead of introducing a
> second counting mechanism. Would have the advantage that numbers are
NO, our counting variables will be reset to ZERO if current slice
time(0.1ms) is used up.
> actually consistent (your metric counts slightly differently than the
> existing info blockstats one).
Yeah, i notice this, and don't think there's wrong with it. and you?
>
>> +    }
>> +
>> +    return ret;
>>  }
>>
>>  typedef struct BlockCompleteData {
>> @@ -2396,15 +2455,14 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>>      BlockDriver *drv = bs->drv;
>>      BlockDriverAIOCB *ret;
>>      BlockCompleteData *blk_cb_data;
>> +    int64_t wait_time = -1;
>>
>>      trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
>>
>> -    if (!drv)
>> -        return NULL;
>> -    if (bs->read_only)
>> -        return NULL;
>> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
>> +    if (!drv || bs->read_only
>> +        || bdrv_check_request(bs, sector_num, nb_sectors)) {
>>          return NULL;
>> +    }
>
> Again, unrelated changes.
Have changed it to original.
>
>>
>>      if (bs->dirty_bitmap) {
>>          blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
>> @@ -2413,13 +2471,32 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>>          opaque = blk_cb_data;
>>      }
>>
>> +    /* throttling disk write I/O */
>> +    if (bs->io_limits_enabled) {
>> +        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
>> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
>> +                                  sector_num, qiov, nb_sectors, cb, opaque);
>> +            if (wait_time != -1) {
>> +                qemu_mod_timer(bs->block_timer,
>> +                               wait_time + qemu_get_clock_ns(vm_clock));
>> +            }
>> +
>> +            return ret;
>> +        }
>> +    }
>
> This looks very similar to the code in bdrv_aio_readv. Can it be moved
> into a common function?
Good advice, done. thanks.

>
> Kevin
>



-- 
Regards,

Zhi Yong Wu
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux