On Thu, 2011-06-02 at 21:01 +0100, Prasad Joshi wrote: > QCOW uses two tables level1 (L1) table and level2 (L2) table. The L1 table > points to offset of L2 table. When a QCOW image is probed, the L1 table is > cached in the memory to avoid reading it from disk on every access. This > caching improves the performance. > > The similar performance improvement can be observed when L2 tables are cached. > It is impossible to cache all of the L2 tables because of the memory > constraint. The patch adds L2 table caching capability for up to 128 L2 tables, > it uses combination of RB tree and List to manage the L2 cached tables. The > link list implementation helps in building simple LRU structure and RB tree > helps to search cached table efficiently > > The performance numbers are below, the machine was started with following > command line arguments > > $ ./kvm run -d /home/prasad/VMDisks/Ubuntu10.10_64_cilk_qemu.qcow \ > > --params "root=/dev/vda1" -m 1024 > > Without QCOW caching > ==================== > $ bonnie++ -d tmp/ -c 10 -s 2048 > Writing a byte at a time...done > Writing intelligently...done > Rewriting...done > Reading a byte at a time...done > Reading intelligently...done > start 'em...done...done...done...done...done... > Create files in sequential order...done. > Stat files in sequential order...done. > Delete files in sequential order...done. > Create files in random order...done. > Stat files in random order...done. > Delete files in random order...done. > Version 1.96 ------Sequential Output------ --Sequential Input- --Random- > Concurrency 10 -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks-- > Machine Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP /sec %CP > prasad-virtual-m 2G 1043 99 555406 74 227605 55 5360 99 489080 68 +++++ +++ > Latency 24646us 48544us 57893us 6686us 3595us 21026us > Version 1.96 ------Sequential Create------ --------Random Create-------- > prasad-virtual-mach -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete-- > files /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP > 16 +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++ > Latency 343us 1175us 327us 555us 48us 82us > 1.96,1.96,prasad-virtual-machine,10,1307043085,2G,,1043,99,555406,74,227605,55, > 5360,99,489080,68,+++++,+++,16,,,,,+++++,+++,+++++,+++,+++++,+++,+++++,+++, > +++++,+++,+++++,+++,24646us,48544us,57893us,6686us,3595us,21026us,343us,1175us, > 327us,555us,48us,82us > > With QCOW caching > ================= > $ bonnie++ -d tmp/ -c 10 -s 2048 > Writing a byte at a time...done > Writing intelligently...done > Rewriting...done > Reading a byte at a time...done > Reading intelligently...done > start 'em...done...done...done...done...done... > Create files in sequential order...done. > Stat files in sequential order...done. > Delete files in sequential order...done. > Create files in random order...done. > Stat files in random order...done. > Delete files in random order...done. > Version 1.96 ------Sequential Output------ --Sequential Input- --Random- > Concurrency 10 -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks-- > Machine Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP /sec %CP > prasad-virtual-m 2G 1033 99 467899 64 182387 41 5422 100 338294 48 +++++ +++ > Latency 21549us 60585us 65723us 6331us 30014us 19994us > Version 1.96 ------Sequential Create------ --------Random Create-------- > prasad-virtual-mach -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete-- > files /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP > 16 +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++ > Latency 478us 1142us 344us 402us 72us 98us > 1.96,1.96,prasad-virtual-machine,10,1307042839,2G,,1033,99,467899,64,182387,41, > 5422,100,338294,48,+++++,+++,16,,,,,+++++,+++,+++++,+++,+++++,+++,+++++,+++, > +++++,+++,+++++,+++,21549us,60585us,65723us,6331us,30014us,19994us,478us,1142us, > 344us,402us,72us,98us > > Summary of performance numbers > ============================== > There is not much difference with sequential character operations are > performed, the code with caching performed better by small margin. The caching > code performance raised by 18% to 24% with sequential block output and by 44% > for sequentail block input. Which is understandable as the Level2 table will > always be cached after a write operation. Random seek operation worked slower > with caching code. > > Signed-off-by: Prasad Joshi <prasadjoshi124@xxxxxxxxx> > --- > tools/kvm/disk/qcow.c | 216 ++++++++++++++++++++++++++++++++++++++---- > tools/kvm/include/kvm/qcow.h | 17 ++++ > 2 files changed, 216 insertions(+), 17 deletions(-) > > diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c > index b52b734..4fa3850 100644 > --- a/tools/kvm/disk/qcow.c > +++ b/tools/kvm/disk/qcow.c > @@ -16,6 +16,140 @@ > #include <linux/kernel.h> > #include <linux/types.h> > > +static int insert(struct rb_root *root, struct qcow_l2_cache *new) > +{ > + struct rb_node **link = &(root->rb_node), *parent = NULL; > + u64 offset = new->offset; > + > + /* search the tree */ > + while (*link) { > + struct qcow_l2_cache *t; > + > + t = rb_entry(*link, struct qcow_l2_cache, node); > + if (!t) > + goto error; > + > + parent = *link; > + > + if (t->offset > offset) > + link = &(*link)->rb_left; > + else if (t->offset < offset) > + link = &(*link)->rb_right; > + else > + goto out; > + } > + > + /* add new node */ > + rb_link_node(&new->node, parent, link); > + rb_insert_color(&new->node, root); > +out: > + return 0; > +error: > + return -1; > +} > + > +static struct qcow_l2_cache *search(struct rb_root *root, u64 offset) > +{ > + struct rb_node *link = root->rb_node; > + > + while (link) { > + struct qcow_l2_cache *t; > + > + t = rb_entry(link, struct qcow_l2_cache, node); > + if (!t) > + goto out; > + > + if (t->offset > offset) > + link = link->rb_left; > + else if (t->offset < offset) > + link = link->rb_right; > + else > + return t; > + } > +out: > + return NULL; > +} > + > +static void free_cache(struct qcow *q) > +{ > + struct list_head *pos, *n; > + struct qcow_l2_cache *t; > + struct rb_root *r = &q->root; > + > + list_for_each_safe(pos, n, &q->lru_list) { > + /* Remove cache table from the list and RB tree */ > + list_del(pos); > + t = list_entry(pos, struct qcow_l2_cache, list); > + rb_erase(&t->node, r); > + > + /* Free the cached node */ > + free(t->table); > + free(t); > + } > +} > + > +static int cache_table(struct qcow *q, u64 *table, u64 offset) > +{ > + struct qcow_l2_cache *n; > + struct rb_root *r = &q->root; > + struct qcow_l2_cache *lru; > + > + n = calloc(1, sizeof(struct qcow_l2_cache)); sizeof(*n) sizeof() should use the variable name itself, not the data type. Check out chapter 14 in 'Documentation/CodingStyle'. > + if (!n) > + goto error; > + > + n->offset = offset; > + n->table = table; > + n->qcow = q; > + RB_CLEAR_NODE(&n->node); > + INIT_LIST_HEAD(&n->list); > + > + if (q->no_cached == MAX_CACHE_NODES) { > + /* Find the node for LRU replacement */ > + lru = list_first_entry(&q->lru_list, struct qcow_l2_cache, list); > + > + /* Remove the node from the cache */ > + rb_erase(&lru->node, r); > + list_del_init(&lru->list); > + q->no_cached--; > + > + /* Free the LRUed node */ > + free(lru->table); > + free(lru); > + } > + > + /* Add new node in RB Tree: Helps in searching faster */ > + if (insert(r, n) < 0) > + goto error; > + > + /* Add in LRU replacement list */ > + list_add_tail(&n->list, &q->lru_list); > + q->no_cached++; > + > + return 0; > +error: > + free(n); > + return -1; > +} > + > +static int search_table(struct qcow *q, u64 **table, u64 offset) > +{ > + struct qcow_l2_cache *c; > + > + *table = NULL; > + > + c = search(&q->root, offset); > + if (!c) > + return -1; > + > + /* Update the LRU state */ > + list_del_init(&c->list); > + list_add_tail(&c->list, &q->lru_list); > + > + *table = c->table; > + return 0; > +} > + > static inline u64 get_l1_index(struct qcow *q, u64 offset) > { > struct qcow_header *header = q->header; > @@ -37,6 +171,43 @@ static inline u64 get_cluster_offset(struct qcow *q, u64 offset) > return offset & ((1 << header->cluster_bits)-1); > } > > +static int qcow_read_l2_table(struct qcow *q, u64 **table, u64 offset) > +{ > + struct qcow_header *header = q->header; > + u64 size; > + u64 i; > + u64 *t; > + > + *table = NULL; > + size = 1 << header->l2_bits; > + > + /* search an entry for offset in cache */ > + if (search_table(q, table, offset) >= 0) > + return 0; > + > + t = calloc(size, sizeof(*t)); > + if (!t) > + goto error; > + > + /* table not cached: read from the disk */ > + if (pread_in_full(q->fd, t, size * sizeof(u64), offset) < 0) > + goto error; > + > + /* cache the table */ > + if (cache_table(q, t, offset) < 0) > + goto error; > + > + /* change cached table to CPU's byte-order */ > + for (i = 0; i < size; i++) > + be64_to_cpus(&t[i]); > + > + *table = t; > + return 0; > +error: > + free(t); > + return -1; > +} > + > static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_len) > { > struct qcow_header *header = q->header; > @@ -51,7 +222,6 @@ static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_ > u64 l1_idx; > u64 l2_idx; > > - l2_table = NULL; > > cluster_size = 1 << header->cluster_bits; > > @@ -72,18 +242,16 @@ static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_ > goto zero_cluster; > > l2_table_size = 1 << header->l2_bits; > - l2_table = calloc(l2_table_size, sizeof(u64)); > - if (!l2_table) > - goto out_error; > > - if (pread_in_full(q->fd, l2_table, l2_table_size * sizeof(u64), l2_table_offset) < 0) > + /* read and cache level 2 table */ > + if (qcow_read_l2_table(q, &l2_table, l2_table_offset) < 0) > goto out_error; > > l2_idx = get_l2_index(q, offset); > if (l2_idx >= l2_table_size) > goto out_error; > > - clust_start = be64_to_cpu(l2_table[l2_idx]) & ~header->oflag_mask; > + clust_start = l2_table[l2_idx] & ~header->oflag_mask; > if (!clust_start) > goto zero_cluster; > > @@ -91,7 +259,6 @@ static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_ > goto out_error; > > out: > - free(l2_table); > return length; > > zero_cluster: > @@ -221,15 +388,16 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src > if (len > src_len) > len = src_len; > > - l2t = calloc(l2t_sz, sizeof(u64)); > - if (!l2t) > - goto error; > - > l2t_off = table->l1_table[l1t_idx] & ~header->oflag_mask; > if (l2t_off) { > - if (pread_in_full(q->fd, l2t, l2t_sz * sizeof(u64), l2t_off) < 0) > - goto free_l2; > + /* cache level2 table */ > + if (qcow_read_l2_table(q, &l2t, l2t_off) < 0) > + goto error; > } else { > + l2t = calloc(l2t_sz, sizeof(*l2t)); > + if (!l2t) > + goto error; > + > /* Capture the state of the consistent QCOW image */ > f_sz = file_size(q->fd); > if (!f_sz) > @@ -251,6 +419,13 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src > goto free_l2; > } > > + if (cache_table(q, l2t, l2t_off) < 0) { > + if (ftruncate(q->fd, f_sz) < 0) > + goto free_l2; > + > + goto free_l2; > + } > + > /* Update the in-core entry */ > table->l1_table[l1t_idx] = l2t_off; > } > @@ -258,17 +433,15 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src > /* Capture the state of the consistent QCOW image */ > f_sz = file_size(q->fd); > if (!f_sz) > - goto free_l2; > + goto error; > > - clust_start = be64_to_cpu(l2t[l2t_idx]) & ~header->oflag_mask; > + clust_start = l2t[l2t_idx] & ~header->oflag_mask; > if (!clust_start) { > clust_start = ALIGN(f_sz, clust_sz); > update_meta = true; > } else > update_meta = false; > > - free(l2t); > - > /* Write actual data */ > if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0) > goto error; > @@ -282,9 +455,13 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src > > goto error; > } > + > + /* Update the cached level2 entry */ > + l2t[l2t_idx] = clust_start; > } > > return len; > + > free_l2: > free(l2t); > error: > @@ -336,6 +513,7 @@ static int qcow_disk_close(struct disk_image *disk) > > q = disk->priv; > > + free_cache(q); > free(q->table.l1_table); > free(q->header); > free(q); > @@ -428,6 +606,8 @@ static struct disk_image *qcow2_probe(int fd, bool readonly) > goto error; > > q->fd = fd; > + q->root = RB_ROOT; > + INIT_LIST_HEAD(&q->lru_list); > > h = q->header = qcow2_read_header(fd); > if (!h) > @@ -525,6 +705,8 @@ static struct disk_image *qcow1_probe(int fd, bool readonly) > goto error; > > q->fd = fd; > + q->root = RB_ROOT; > + INIT_LIST_HEAD(&q->lru_list); > > h = q->header = qcow1_read_header(fd); > if (!h) > diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h > index b6e7493..1663819 100644 > --- a/tools/kvm/include/kvm/qcow.h > +++ b/tools/kvm/include/kvm/qcow.h > @@ -3,6 +3,8 @@ > > #include <linux/types.h> > #include <stdbool.h> > +#include <linux/rbtree.h> > +#include <linux/list.h> > > #define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) > > @@ -17,6 +19,16 @@ > #define QCOW2_OFLAG_COMPRESSED (1LL << 62) > #define QCOW2_OFLAG_MASK (QCOW2_OFLAG_COPIED|QCOW2_OFLAG_COMPRESSED) > > +#define MAX_CACHE_NODES 128 > + > +struct qcow_l2_cache { > + u64 offset; > + u64 *table; > + struct rb_node node; > + struct list_head list; > + struct qcow *qcow; I only see 'qcow' being initialized, but isn't used anywhere. Is it really needed? > +}; > + > struct qcow_table { > u32 table_size; > u64 *l1_table; > @@ -26,6 +38,11 @@ struct qcow { > void *header; > struct qcow_table table; > int fd; > + > + /* Level2 caching data structures */ > + struct rb_root root; > + struct list_head lru_list; > + int no_cached; > }; > > struct qcow_header { -- Sasha. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html