On Wed, Jul 27, 2011 at 3:26 AM, Marcelo Tosatti <mtosatti@xxxxxxxxxx> wrote: > On Tue, Jul 26, 2011 at 04:59:06PM +0800, Zhi Yong Wu wrote: >> Welcome to give me your comments, thanks. >> >> Signed-off-by: Zhi Yong Wu <wuzhy@xxxxxxxxxxxxxxxxxx> >> --- >> Makefile.objs | 2 +- >> block.c | 288 +++++++++++++++++++++++++++++++++++++++++++++++++++-- >> block.h | 1 - >> block/blk-queue.c | 116 +++++++++++++++++++++ >> block/blk-queue.h | 70 +++++++++++++ >> block_int.h | 28 +++++ >> blockdev.c | 21 ++++ >> qemu-config.c | 24 +++++ >> qemu-option.c | 17 +++ >> qemu-option.h | 1 + >> qemu-options.hx | 1 + >> 11 files changed, 559 insertions(+), 10 deletions(-) >> create mode 100644 block/blk-queue.c >> create mode 100644 block/blk-queue.h >> >> diff --git a/Makefile.objs b/Makefile.objs >> index 9f99ed4..06f2033 100644 >> --- a/Makefile.objs >> +++ b/Makefile.objs >> @@ -23,7 +23,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv >> block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o >> block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o >> block-nested-y += qed-check.o >> -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o >> +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blk-queue.o >> block-nested-$(CONFIG_WIN32) += raw-win32.o >> block-nested-$(CONFIG_POSIX) += raw-posix.o >> block-nested-$(CONFIG_CURL) += curl.o >> diff --git a/block.c b/block.c >> index 24a25d5..e54e59c 100644 >> --- a/block.c >> +++ b/block.c >> @@ -29,6 +29,9 @@ >> #include "module.h" >> #include "qemu-objects.h" >> >> +#include "qemu-timer.h" >> +#include "block/blk-queue.h" >> + >> #ifdef CONFIG_BSD >> #include <sys/types.h> >> #include <sys/stat.h> >> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num, >> static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num, >> const uint8_t *buf, int nb_sectors); >> >> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, >> + bool is_write, double elapsed_time, uint64_t *wait); >> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, >> + double elapsed_time, uint64_t *wait); >> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, >> + bool is_write, uint64_t *wait); >> + >> static QTAILQ_HEAD(, BlockDriverState) bdrv_states = >> QTAILQ_HEAD_INITIALIZER(bdrv_states); >> >> @@ -90,6 +100,20 @@ int is_windows_drive(const char *filename) >> } >> #endif >> >> +static int bdrv_io_limits_enable(BlockIOLimit *io_limits) >> +{ >> + if ((io_limits->bps[0] == 0) >> + && (io_limits->bps[1] == 0) >> + && (io_limits->bps[2] == 0) >> + && (io_limits->iops[0] == 0) >> + && (io_limits->iops[1] == 0) >> + && (io_limits->iops[2] == 0)) { >> + return 0; >> + } >> + >> + return 1; >> +} >> + >> /* check if the path starts with "<protocol>:" */ >> static int path_has_protocol(const char *path) >> { >> @@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size, >> } >> } >> >> +static void bdrv_block_timer(void *opaque) >> +{ >> + BlockDriverState *bs = opaque; >> + BlockQueue *queue = bs->block_queue; >> + >> + while (!QTAILQ_EMPTY(&queue->requests)) { >> + BlockIORequest *request; >> + int ret; >> + >> + request = QTAILQ_FIRST(&queue->requests); >> + QTAILQ_REMOVE(&queue->requests, request, entry); >> + >> + ret = qemu_block_queue_handler(request); >> + if (ret == 0) { >> + QTAILQ_INSERT_HEAD(&queue->requests, request, entry); >> + break; >> + } >> + >> + qemu_free(request); >> + } >> +} >> + >> void bdrv_register(BlockDriver *bdrv) >> { >> if (!bdrv->bdrv_aio_readv) { >> @@ -642,6 +688,15 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, >> bs->change_cb(bs->change_opaque, CHANGE_MEDIA); >> } >> >> + /* throttling disk I/O limits */ >> + if (bdrv_io_limits_enable(&bs->io_limits)) { >> + bs->block_queue = qemu_new_block_queue(); >> + bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs); >> + >> + bs->slice_start[0] = qemu_get_clock_ns(vm_clock); >> + bs->slice_start[1] = qemu_get_clock_ns(vm_clock); >> + } >> + > > It should be possible to tune the limits on the flight, please introduce > QMP commands for that. Yeah, I am working on this. > >> return 0; >> >> unlink_and_fail: >> @@ -680,6 +735,16 @@ void bdrv_close(BlockDriverState *bs) >> if (bs->change_cb) >> bs->change_cb(bs->change_opaque, CHANGE_MEDIA); >> } >> + >> + /* throttling disk I/O limits */ >> + if (bs->block_queue) { >> + qemu_del_block_queue(bs->block_queue); >> + } >> + >> + if (bs->block_timer) { >> + qemu_del_timer(bs->block_timer); >> + qemu_free_timer(bs->block_timer); >> + } >> } >> >> void bdrv_close_all(void) >> @@ -1312,6 +1377,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs, >> *psecs = bs->secs; >> } >> >> +/* throttling disk io limits */ >> +void bdrv_set_io_limits(BlockDriverState *bs, >> + BlockIOLimit *io_limits) >> +{ >> + memset(&bs->io_limits, 0, sizeof(BlockIOLimit)); >> + bs->io_limits = *io_limits; >> +} >> + >> /* Recognize floppy formats */ >> typedef struct FDFormat { >> FDriveType drive; >> @@ -2111,6 +2184,155 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn) >> return buf; >> } >> >> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, >> + bool is_write, double elapsed_time, uint64_t *wait) { >> + uint64_t bps_limit = 0; >> + double bytes_limit, bytes_disp, bytes_res; >> + double slice_time = 0.1, wait_time; >> + >> + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { >> + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; >> + } else if (bs->io_limits.bps[is_write]) { >> + bps_limit = bs->io_limits.bps[is_write]; >> + } else { >> + if (wait) { >> + *wait = 0; >> + } >> + >> + return false; >> + } >> + >> + bytes_limit = bps_limit * slice_time; >> + bytes_disp = bs->io_disps.bytes[is_write]; >> + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { >> + bytes_disp += bs->io_disps.bytes[!is_write]; >> + } >> + >> + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE; > > Virtio can submit requests of 512 sectors or more... does not play > well with 1MB/sec limit. Yeah, thanks for you catch. I will fix this. > >> + if (bytes_disp + bytes_res <= bytes_limit) { >> + if (wait) { >> + *wait = 0; >> + } >> + >> + return false; >> + } >> + >> + /* Calc approx time to dispatch */ >> + wait_time = (bytes_disp + bytes_res - bytes_limit) / bps_limit; >> + if (!wait_time) { >> + wait_time = 1; >> + } >> + >> + wait_time = wait_time + (slice_time - elapsed_time); >> + if (wait) { >> + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1; >> + } > > The guest can keep submitting requests where "wait_time = 1" above, > and the timer will be rearmed continuously in the future. Can't you > simply arm the timer to the next slice start? _Some_ data must be > transfered by then, anyway (and nothing can be transfered earlier than > that). Sorry, i have got what you mean. Can you elaborate in more detail? > > Same for iops calculation below. > >> + >> + return true; >> +} >> + >> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, >> + double elapsed_time, uint64_t *wait) { >> + uint64_t iops_limit = 0; >> + double ios_limit, ios_disp; >> + double slice_time = 0.1, wait_time; >> + >> + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { >> + iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; >> + } else if (bs->io_limits.iops[is_write]) { >> + iops_limit = bs->io_limits.iops[is_write]; >> + } else { >> + if (wait) { >> + *wait = 0; >> + } >> + >> + return false; >> + } >> + >> + ios_limit = iops_limit * slice_time; >> + ios_disp = bs->io_disps.ios[is_write]; >> + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { >> + ios_disp += bs->io_disps.ios[!is_write]; >> + } >> + >> + if (ios_disp + 1 <= ios_limit) { >> + if (wait) { >> + *wait = 0; >> + } >> + >> + return false; >> + } >> + >> + /* Calc approx time to dispatch */ >> + wait_time = (ios_disp + 1) / iops_limit; >> + if (wait_time > elapsed_time) { >> + wait_time = wait_time - elapsed_time; >> + } else { >> + wait_time = 0; >> + } >> + >> + if (wait) { >> + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1; >> + } >> + >> + return true; >> +} >> + >> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, >> + bool is_write, uint64_t *wait) { >> + int64_t real_time; >> + uint64_t bps_wait = 0, iops_wait = 0, max_wait; >> + double elapsed_time; >> + int bps_ret, iops_ret; >> + >> + real_time = qemu_get_clock_ns(vm_clock); >> + if (bs->slice_start[is_write] + BLOCK_IO_SLICE_TIME <= real_time) { >> + bs->slice_start[is_write] = real_time; >> + >> + bs->io_disps.bytes[is_write] = 0; >> + bs->io_disps.bytes[!is_write] = 0; >> + >> + bs->io_disps.ios[is_write] = 0; >> + bs->io_disps.ios[!is_write] = 0; >> + } >> + >> + /* If a limit was exceeded, immediately queue this request */ >> + if ((bs->req_from_queue == false) >> + && !QTAILQ_EMPTY(&bs->block_queue->requests)) { >> + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] >> + || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write] >> + || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { >> + if (wait) { >> + *wait = 0; >> + } >> + >> + return true; >> + } >> + } >> + >> + elapsed_time = real_time - bs->slice_start[is_write]; >> + elapsed_time /= (BLOCK_IO_SLICE_TIME * 10.0); >> + >> + bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors, >> + is_write, elapsed_time, &bps_wait); >> + iops_ret = bdrv_exceed_iops_limits(bs, is_write, >> + elapsed_time, &iops_wait); >> + if (bps_ret || iops_ret) { >> + max_wait = bps_wait > iops_wait ? bps_wait : iops_wait; >> + if (wait) { >> + *wait = max_wait; >> + } >> + >> + return true; >> + } >> + >> + if (wait) { >> + *wait = 0; >> + } >> + >> + return false; >> +} >> >> /**************************************************************/ >> /* async I/Os */ >> @@ -2121,13 +2343,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, >> { >> BlockDriver *drv = bs->drv; >> BlockDriverAIOCB *ret; >> + uint64_t wait_time = 0; >> >> trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); >> >> - if (!drv) >> - return NULL; >> - if (bdrv_check_request(bs, sector_num, nb_sectors)) >> + if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) { >> + if (bdrv_io_limits_enable(&bs->io_limits)) { >> + bs->req_from_queue = false; >> + } >> return NULL; >> + } >> + >> + /* throttling disk read I/O */ >> + if (bdrv_io_limits_enable(&bs->io_limits)) { >> + if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) { >> + ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv, >> + sector_num, qiov, nb_sectors, cb, opaque); >> + qemu_mod_timer(bs->block_timer, >> + wait_time + qemu_get_clock_ns(vm_clock)); >> + bs->req_from_queue = false; >> + return ret; >> + } >> + } >> >> ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, >> cb, opaque); >> @@ -2136,6 +2373,16 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, >> /* Update stats even though technically transfer has not happened. */ >> bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE; >> bs->rd_ops ++; >> + >> + if (bdrv_io_limits_enable(&bs->io_limits)) { >> + bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] += >> + (unsigned) nb_sectors * BDRV_SECTOR_SIZE; >> + bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++; >> + } >> + } >> + >> + if (bdrv_io_limits_enable(&bs->io_limits)) { >> + bs->req_from_queue = false; >> } >> >> return ret; >> @@ -2184,15 +2431,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, >> BlockDriver *drv = bs->drv; >> BlockDriverAIOCB *ret; >> BlockCompleteData *blk_cb_data; >> + uint64_t wait_time = 0; >> >> trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); >> >> - if (!drv) >> - return NULL; >> - if (bs->read_only) >> - return NULL; >> - if (bdrv_check_request(bs, sector_num, nb_sectors)) >> + if (!drv || bs->read_only >> + || bdrv_check_request(bs, sector_num, nb_sectors)) { >> + if (bdrv_io_limits_enable(&bs->io_limits)) { >> + bs->req_from_queue = false; >> + } >> + >> return NULL; >> + } >> >> if (bs->dirty_bitmap) { >> blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb, >> @@ -2201,6 +2451,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, >> opaque = blk_cb_data; >> } >> >> + /* throttling disk write I/O */ >> + if (bdrv_io_limits_enable(&bs->io_limits)) { >> + if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) { >> + ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev, >> + sector_num, qiov, nb_sectors, cb, opaque); >> + qemu_mod_timer(bs->block_timer, >> + wait_time + qemu_get_clock_ns(vm_clock)); >> + bs->req_from_queue = false; >> + return ret; >> + } >> + } >> + >> ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, >> cb, opaque); >> >> @@ -2211,6 +2473,16 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, >> if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { >> bs->wr_highest_sector = sector_num + nb_sectors - 1; >> } >> + >> + if (bdrv_io_limits_enable(&bs->io_limits)) { >> + bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] += >> + (unsigned) nb_sectors * BDRV_SECTOR_SIZE; >> + bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++; >> + } >> + } >> + >> + if (bdrv_io_limits_enable(&bs->io_limits)) { >> + bs->req_from_queue = false; >> } >> >> return ret; >> diff --git a/block.h b/block.h >> index 859d1d9..f0dac62 100644 >> --- a/block.h >> +++ b/block.h >> @@ -97,7 +97,6 @@ int bdrv_change_backing_file(BlockDriverState *bs, >> const char *backing_file, const char *backing_fmt); >> void bdrv_register(BlockDriver *bdrv); >> >> - >> typedef struct BdrvCheckResult { >> int corruptions; >> int leaks; >> diff --git a/block/blk-queue.c b/block/blk-queue.c >> new file mode 100644 >> index 0000000..09fcfe9 >> --- /dev/null >> +++ b/block/blk-queue.c >> @@ -0,0 +1,116 @@ >> +/* >> + * QEMU System Emulator queue definition for block layer >> + * >> + * Copyright (c) 2011 Zhi Yong Wu <zwu.kernel@xxxxxxxxx> >> + * >> + * Permission is hereby granted, free of charge, to any person obtaining a copy >> + * of this software and associated documentation files (the "Software"), to deal >> + * in the Software without restriction, including without limitation the rights >> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell >> + * copies of the Software, and to permit persons to whom the Software is >> + * furnished to do so, subject to the following conditions: >> + * >> + * The above copyright notice and this permission notice shall be included in >> + * all copies or substantial portions of the Software. >> + * >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, >> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN >> + * THE SOFTWARE. >> + */ >> + >> +#include "block_int.h" >> +#include "qemu-queue.h" >> +#include "block/blk-queue.h" >> + >> +/* The APIs for block request queue on qemu block layer. >> + */ >> + >> +static void qemu_block_queue_cancel(BlockDriverAIOCB *acb) >> +{ >> + qemu_aio_release(acb); >> +} >> + >> +static AIOPool block_queue_pool = { >> + .aiocb_size = sizeof(struct BlockDriverAIOCB), >> + .cancel = qemu_block_queue_cancel, >> +}; >> + >> +static void qemu_block_queue_callback(void *opaque, int ret) >> +{ >> + BlockDriverAIOCB *acb = opaque; >> + >> + qemu_aio_release(acb); >> +} >> + >> +BlockQueue *qemu_new_block_queue(void) >> +{ >> + BlockQueue *queue; >> + >> + queue = qemu_mallocz(sizeof(BlockQueue)); >> + >> + QTAILQ_INIT(&queue->requests); >> + >> + return queue; >> +} >> + >> +void qemu_del_block_queue(BlockQueue *queue) >> +{ >> + BlockIORequest *request, *next; >> + >> + QTAILQ_FOREACH_SAFE(request, &queue->requests, entry, next) { >> + QTAILQ_REMOVE(&queue->requests, request, entry); >> + qemu_free(request); >> + } >> + >> + qemu_free(queue); >> +} >> + >> +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue, >> + BlockDriverState *bs, >> + BlockRequestHandler *handler, >> + int64_t sector_num, >> + QEMUIOVector *qiov, >> + int nb_sectors, >> + BlockDriverCompletionFunc *cb, >> + void *opaque) >> +{ >> + BlockIORequest *request; >> + BlockDriverAIOCB *acb; >> + >> + request = qemu_malloc(sizeof(BlockIORequest)); >> + request->bs = bs; >> + request->handler = handler; >> + request->sector_num = sector_num; >> + request->qiov = qiov; >> + request->nb_sectors = nb_sectors; >> + request->cb = cb; >> + request->opaque = opaque; >> + >> + QTAILQ_INSERT_TAIL(&queue->requests, request, entry); >> + >> + acb = qemu_aio_get(&block_queue_pool, bs, >> + qemu_block_queue_callback, opaque); >> + >> + return acb; >> +} >> + >> +int qemu_block_queue_handler(BlockIORequest *request) >> +{ >> + int ret; >> + BlockDriverAIOCB *res; >> + >> + /* indicate this req is from block queue */ >> + request->bs->req_from_queue = true; >> + >> + res = request->handler(request->bs, request->sector_num, >> + request->qiov, request->nb_sectors, >> + request->cb, request->opaque); >> + >> + ret = (res == NULL) ? 0 : 1; >> + >> + return ret; >> +} >> diff --git a/block/blk-queue.h b/block/blk-queue.h >> new file mode 100644 >> index 0000000..47f8a36 >> --- /dev/null >> +++ b/block/blk-queue.h >> @@ -0,0 +1,70 @@ >> +/* >> + * QEMU System Emulator queue declaration for block layer >> + * >> + * Copyright (c) 2011 Zhi Yong Wu <zwu.kernel@xxxxxxxxx> >> + * >> + * Permission is hereby granted, free of charge, to any person obtaining a copy >> + * of this software and associated documentation files (the "Software"), to deal >> + * in the Software without restriction, including without limitation the rights >> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell >> + * copies of the Software, and to permit persons to whom the Software is >> + * furnished to do so, subject to the following conditions: >> + * >> + * The above copyright notice and this permission notice shall be included in >> + * all copies or substantial portions of the Software. >> + * >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, >> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN >> + * THE SOFTWARE. >> + */ >> + >> +#ifndef QEMU_BLOCK_QUEUE_H >> +#define QEMU_BLOCK_QUEUE_H >> + >> +#include "block.h" >> +#include "qemu-queue.h" >> +#include "qemu-common.h" >> + >> +typedef BlockDriverAIOCB* (BlockRequestHandler) (BlockDriverState *bs, >> + int64_t sector_num, QEMUIOVector *qiov, >> + int nb_sectors, BlockDriverCompletionFunc *cb, >> + void *opaque); >> + >> +struct BlockIORequest { >> + QTAILQ_ENTRY(BlockIORequest) entry; >> + BlockDriverState *bs; >> + BlockRequestHandler *handler; >> + int64_t sector_num; >> + QEMUIOVector *qiov; >> + int nb_sectors; >> + BlockDriverCompletionFunc *cb; >> + void *opaque; >> +}; >> + >> +typedef struct BlockIORequest BlockIORequest; >> + >> +struct BlockQueue { >> + QTAILQ_HEAD(requests, BlockIORequest) requests; >> +}; >> + >> +typedef struct BlockQueue BlockQueue; >> + >> +BlockQueue *qemu_new_block_queue(void); >> + >> +void qemu_del_block_queue(BlockQueue *queue); >> + >> +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue, >> + BlockDriverState *bs, >> + BlockRequestHandler *handler, >> + int64_t sector_num, >> + QEMUIOVector *qiov, >> + int nb_sectors, >> + BlockDriverCompletionFunc *cb, >> + void *opaque); >> + >> +int qemu_block_queue_handler(BlockIORequest *request); >> +#endif /* QEMU_BLOCK_QUEUE_H */ >> diff --git a/block_int.h b/block_int.h >> index 1e265d2..1587171 100644 >> --- a/block_int.h >> +++ b/block_int.h >> @@ -27,10 +27,17 @@ >> #include "block.h" >> #include "qemu-option.h" >> #include "qemu-queue.h" >> +#include "block/blk-queue.h" >> >> #define BLOCK_FLAG_ENCRYPT 1 >> #define BLOCK_FLAG_COMPAT6 4 >> >> +#define BLOCK_IO_LIMIT_READ 0 >> +#define BLOCK_IO_LIMIT_WRITE 1 >> +#define BLOCK_IO_LIMIT_TOTAL 2 >> + >> +#define BLOCK_IO_SLICE_TIME 100000000 >> + >> #define BLOCK_OPT_SIZE "size" >> #define BLOCK_OPT_ENCRYPT "encryption" >> #define BLOCK_OPT_COMPAT6 "compat6" >> @@ -46,6 +53,16 @@ typedef struct AIOPool { >> BlockDriverAIOCB *free_aiocb; >> } AIOPool; >> >> +typedef struct BlockIOLimit { >> + uint64_t bps[3]; >> + uint64_t iops[3]; >> +} BlockIOLimit; >> + >> +typedef struct BlockIODisp { >> + uint64_t bytes[2]; >> + uint64_t ios[2]; >> +} BlockIODisp; >> + >> struct BlockDriver { >> const char *format_name; >> int instance_size; >> @@ -175,6 +192,14 @@ struct BlockDriverState { >> >> void *sync_aiocb; >> >> + /* the time for latest disk I/O */ >> + int64_t slice_start[2]; >> + BlockIOLimit io_limits; >> + BlockIODisp io_disps; >> + BlockQueue *block_queue; >> + QEMUTimer *block_timer; >> + bool req_from_queue; >> + >> /* I/O stats (display with "info blockstats"). */ >> uint64_t rd_bytes; >> uint64_t wr_bytes; >> @@ -222,6 +247,9 @@ void qemu_aio_release(void *p); >> >> void *qemu_blockalign(BlockDriverState *bs, size_t size); >> >> +void bdrv_set_io_limits(BlockDriverState *bs, >> + BlockIOLimit *io_limits); >> + >> #ifdef _WIN32 >> int is_windows_drive(const char *filename); >> #endif >> diff --git a/blockdev.c b/blockdev.c >> index c263663..45602f4 100644 >> --- a/blockdev.c >> +++ b/blockdev.c >> @@ -238,6 +238,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) >> int on_read_error, on_write_error; >> const char *devaddr; >> DriveInfo *dinfo; >> + BlockIOLimit io_limits; >> + bool iol_flag = false; >> + const char *iol_opts[7] = {"bps", "bps_rd", "bps_wr", "iops", "iops_rd", "iops_wr"}; >> int is_extboot = 0; >> int snapshot = 0; >> int ret; >> @@ -372,6 +375,19 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) >> return NULL; >> } >> >> + /* disk io limits */ >> + iol_flag = qemu_opt_io_limits_enable_flag(opts, iol_opts); >> + if (iol_flag) { >> + memset(&io_limits, 0, sizeof(BlockIOLimit)); >> + >> + io_limits.bps[2] = qemu_opt_get_number(opts, "bps", 0); >> + io_limits.bps[0] = qemu_opt_get_number(opts, "bps_rd", 0); >> + io_limits.bps[1] = qemu_opt_get_number(opts, "bps_wr", 0); >> + io_limits.iops[2] = qemu_opt_get_number(opts, "iops", 0); >> + io_limits.iops[0] = qemu_opt_get_number(opts, "iops_rd", 0); >> + io_limits.iops[1] = qemu_opt_get_number(opts, "iops_wr", 0); >> + } >> + >> on_write_error = BLOCK_ERR_STOP_ENOSPC; >> if ((buf = qemu_opt_get(opts, "werror")) != NULL) { >> if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) { >> @@ -483,6 +499,11 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) >> >> bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error); >> >> + /* throttling disk io limits */ >> + if (iol_flag) { >> + bdrv_set_io_limits(dinfo->bdrv, &io_limits); >> + } >> + >> switch(type) { >> case IF_IDE: >> case IF_SCSI: >> diff --git a/qemu-config.c b/qemu-config.c >> index efa892c..9232bbb 100644 >> --- a/qemu-config.c >> +++ b/qemu-config.c >> @@ -82,6 +82,30 @@ static QemuOptsList qemu_drive_opts = { >> .name = "boot", >> .type = QEMU_OPT_BOOL, >> .help = "make this a boot drive", >> + },{ >> + .name = "iops", >> + .type = QEMU_OPT_NUMBER, >> + .help = "limit total I/O operations per second", >> + },{ >> + .name = "iops_rd", >> + .type = QEMU_OPT_NUMBER, >> + .help = "limit read operations per second", >> + },{ >> + .name = "iops_wr", >> + .type = QEMU_OPT_NUMBER, >> + .help = "limit write operations per second", >> + },{ >> + .name = "bps", >> + .type = QEMU_OPT_NUMBER, >> + .help = "limit total bytes per second", >> + },{ >> + .name = "bps_rd", >> + .type = QEMU_OPT_NUMBER, >> + .help = "limit read bytes per second", >> + },{ >> + .name = "bps_wr", >> + .type = QEMU_OPT_NUMBER, >> + .help = "limit write bytes per second", >> }, >> { /* end of list */ } >> }, >> diff --git a/qemu-option.c b/qemu-option.c >> index 65db542..9fe234d 100644 >> --- a/qemu-option.c >> +++ b/qemu-option.c >> @@ -562,6 +562,23 @@ uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval) >> return opt->value.uint; >> } >> >> +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts) >> +{ >> + int i; >> + uint64_t opt_val = 0; >> + bool iol_flag = false; >> + >> + for (i = 0; iol_opts[i]; i++) { >> + opt_val = qemu_opt_get_number(opts, iol_opts[i], 0); >> + if (opt_val != 0) { >> + iol_flag = true; >> + break; >> + } >> + } >> + >> + return iol_flag; >> +} >> + >> uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval) >> { >> QemuOpt *opt = qemu_opt_find(opts, name); >> diff --git a/qemu-option.h b/qemu-option.h >> index b515813..fc909f9 100644 >> --- a/qemu-option.h >> +++ b/qemu-option.h >> @@ -107,6 +107,7 @@ struct QemuOptsList { >> const char *qemu_opt_get(QemuOpts *opts, const char *name); >> int qemu_opt_get_bool(QemuOpts *opts, const char *name, int defval); >> uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval); >> +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts); >> uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval); >> int qemu_opt_set(QemuOpts *opts, const char *name, const char *value); >> typedef int (*qemu_opt_loopfunc)(const char *name, const char *value, void *opaque); >> diff --git a/qemu-options.hx b/qemu-options.hx >> index cb3347e..ae219f5 100644 >> --- a/qemu-options.hx >> +++ b/qemu-options.hx >> @@ -121,6 +121,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive, >> " [,cache=writethrough|writeback|none|unsafe][,format=f]\n" >> " [,serial=s][,addr=A][,id=name][,aio=threads|native]\n" >> " [,readonly=on|off][,boot=on|off]\n" >> + " [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n" >> " use 'file' as a drive image\n", QEMU_ARCH_ALL) >> STEXI >> @item -drive @var{option}[,@var{option}[,@var{option}[,...]]] >> -- >> 1.7.2.3 >> >> -- >> To unsubscribe from this list: send the line "unsubscribe kvm" in >> the body of a message to majordomo@xxxxxxxxxxxxxxx >> More majordomo info at http://vger.kernel.org/majordomo-info.html > -- Regards, Zhi Yong Wu -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html