On Wed, 2011-05-18 at 11:17 +0100, Prasad Joshi wrote: > QCOW uses two tables level1 (L1) table and level2 > (L2) table. The L1 table points to offset of L2 > table. When a QCOW image is probed, the L1 table is > cached in the memory to avoid reading it from disk > on every reference. This caching imporves the > performance. The similar performance improvment can > be observed when L2 tables are also cached. It is > impossible to cache all of the L2 tables because of > the memory constraint. The patch adds L2 table > caching capability for upto 128 L2 tables, it uses > combination of RB tree and List to manage the L2 > cached tables. The link list implementation helps > in building simple LRU structure and RB tree helps > in improving the search time during read/write > operations. > > Signed-off-by: Prasad Joshi <prasadjoshi124@xxxxxxxxx> > --- > tools/kvm/include/kvm/qcow.h | 18 ++++ > tools/kvm/qcow.c | 230 ++++++++++++++++++++++++++++++++++++++---- > 2 files changed, 230 insertions(+), 18 deletions(-) > > diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h > index b6e7493..095566a 100644 > --- a/tools/kvm/include/kvm/qcow.h > +++ b/tools/kvm/include/kvm/qcow.h > @@ -3,6 +3,8 @@ > > #include <linux/types.h> > #include <stdbool.h> > +#include <linux/rbtree.h> > +#include <linux/list.h> > > #define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) > > @@ -17,6 +19,16 @@ > #define QCOW2_OFLAG_COMPRESSED (1LL << 62) > #define QCOW2_OFLAG_MASK (QCOW2_OFLAG_COPIED|QCOW2_OFLAG_COMPRESSED) > > +#define MAX_CACHE_NODES 128 > + > +struct qcow_l2_cache { > + u64 offset; > + u64 *table; > + struct rb_node node; > + struct list_head list; > + struct qcow *qcow; > +}; > + > struct qcow_table { > u32 table_size; > u64 *l1_table; > @@ -26,6 +38,12 @@ struct qcow { > void *header; > struct qcow_table table; > int fd; > + > + /* Level2 caching data structures */ > + struct qcow_l2_cache cache; Why is 'cache' here? > + struct rb_root root; > + struct list_head lru_list; > + int no_cached; > }; > > struct qcow_header { > diff --git a/tools/kvm/qcow.c b/tools/kvm/qcow.c > index bb2345c..ca61374 100644 > --- a/tools/kvm/qcow.c > +++ b/tools/kvm/qcow.c > @@ -16,6 +16,153 @@ > #include <linux/kernel.h> > #include <linux/types.h> > > +static inline int insert(struct rb_root *root, struct qcow_l2_cache *new) > +{ > + struct rb_node **link = &(root->rb_node), *parent = NULL; > + u64 offset = new->offset; > + > + /* search the tree */ > + while (*link) { > + struct qcow_l2_cache *t; > + > + t = rb_entry(*link, struct qcow_l2_cache, node); > + if (!t) > + goto error; > + > + parent = *link; > + > + if (t->offset > offset) > + link = &(*link)->rb_left; > + else if (t->offset < offset) > + link = &(*link)->rb_right; > + else > + goto out; > + } > + > + /* add new node */ > + rb_link_node(&new->node, parent, link); > + rb_insert_color(&new->node, root); > +out: > + return 0; > +error: > + return -1; > +} > + > +static inline struct qcow_l2_cache *search(struct rb_root *root, u64 offset) > +{ > + struct rb_node *link = root->rb_node; > + > + while (link) { > + struct qcow_l2_cache *t; > + > + t = rb_entry(link, struct qcow_l2_cache, node); > + if (!t) > + goto out; > + > + if (t->offset > offset) > + link = link->rb_left; > + else if (t->offset < offset) > + link = link->rb_right; > + else > + return t; > + } > +out: > + return NULL; > +} > + > +static inline void free_cache(struct qcow *q) > +{ > + struct list_head *pos, *n; > + struct qcow_l2_cache *t; > + struct rb_root *r = &q->root; > + > + list_for_each_safe(pos, n, &q->lru_list) { > + /* Remove cache table from the list and RB tree */ > + list_del(pos); > + t = list_entry(pos, struct qcow_l2_cache, list); > + rb_erase(&t->node, r); > + > + /* Free the cached node */ > + free(t->table); > + free(t); > + } > +} > + > +static inline int cache_table(struct qcow *q, u64 *table, u64 offset) > +{ > + struct qcow_l2_cache *n; > + struct rb_root *r = &q->root; > + struct qcow_l2_cache *lru; > + > + n = search(r, offset); > + if (n) { > + /* Update the LRU state */ > + list_del_init(&n->list); > + list_add_tail(&n->list, &q->lru_list); > + return 0; > + } > + > + lru = NULL; > + if (q->no_cached == MAX_CACHE_NODES) { > + /* Find the node for LRU replacement */ > + lru = list_first_entry(&q->lru_list, struct qcow_l2_cache, list); > + } > + > + n = calloc(1, sizeof(struct qcow_l2_cache)); sizeof(*n) > + if (!n) > + goto error; > + > + n->offset = offset; > + n->table = table; > + n->qcow = q; > + RB_CLEAR_NODE(&n->node); > + INIT_LIST_HEAD(&n->list); > + > + if (lru) { > + /* LRU Replacemen */ Replacement > + rb_replace_node(&lru->node, &n->node, r); rb_replace_node() doesn't re-balance the tree, it's intended to be used when both keys are the same. Doing so for nodes with different keys breaks the rbtree. > + > + /* > + * The new node should be the last node in the LRU list > + * therefore, do not list_replace instead delete the node to > + * be replaced and add a new node at the end of the list. > + */ > + list_del_init(&lru->list); > + list_add_tail(&n->list, &q->lru_list); > + > + /* Free the LRUed node */ > + free(lru->table); > + free(lru); > + } else { > + /* Add in RB Tree: Helps in searching faster */ > + if (insert(r, n) < 0) > + goto error; > + > + /* Add in LRU replacement list */ > + list_add_tail(&n->list, &q->lru_list); > + q->no_cached++; > + } > + > + return 0; > +error: > + free(n); > + return -1; > +} > + > +static inline int search_table(struct qcow *q, u64 **table, u64 offset) > +{ > + struct qcow_l2_cache *c; > + > + *table = NULL; > + > + c = search(&q->root, offset); > + if (!c) > + return -1; > + > + *table = c->table; > + return 0; > +} > + > static inline u64 get_l1_index(struct qcow *q, u64 offset) > { > struct qcow_header *header = q->header; > @@ -37,6 +184,43 @@ static inline u64 get_cluster_offset(struct qcow *q, u64 offset) > return offset & ((1 << header->cluster_bits)-1); > } > > +static int qcow1_read_l2_table(struct qcow *q, u64 **table, u64 offset) > +{ > + struct qcow_header *header = q->header; > + u64 size; > + u64 i; > + u64 *t; > + > + *table = NULL; > + size = 1 << header->l2_bits; > + > + /* search an entry for offset in cache */ > + if (search_table(q, table, offset) >= 0) > + return 0; > + > + t = calloc(size, sizeof(u64)); sizeof(*t) > + if (!t) > + goto error; > + > + /* table not cached: read from the disk */ > + if (pread_in_full(q->fd, t, size * sizeof(u64), offset) < 0) > + goto error; > + > + /* cache the table */ > + if (cache_table(q, t, offset) < 0) > + goto error; > + > + /* change cached table to CPU's byte-order */ > + for (i = 0; i < size; i++) > + be64_to_cpus(&t[i]); > + > + *table = t; > + return 0; > +error: > + free(t); > + return -1; > +} > + > static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_len) > { > struct qcow_header *header = q->header; > @@ -51,8 +235,6 @@ static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst > u64 l1_idx; > u64 l2_idx; > > - l2_table = NULL; > - > cluster_size = 1 << header->cluster_bits; > > l1_idx = get_l1_index(q, offset); > @@ -72,18 +254,16 @@ static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst > goto zero_cluster; > > l2_table_size = 1 << header->l2_bits; > - l2_table = calloc(l2_table_size, sizeof(u64)); > - if (!l2_table) > - goto out_error; > > - if (pread_in_full(q->fd, l2_table, l2_table_size * sizeof(u64), l2_table_offset) < 0) > + /* read and cache level 2 table */ > + if (qcow1_read_l2_table(q, &l2_table, l2_table_offset) < 0) > goto out_error; > > l2_idx = get_l2_index(q, offset); > if (l2_idx >= l2_table_size) > goto out_error; > > - clust_start = be64_to_cpu(l2_table[l2_idx]) & ~header->oflag_mask; > + clust_start = l2_table[l2_idx] & ~header->oflag_mask; > if (!clust_start) > goto zero_cluster; > > @@ -91,7 +271,6 @@ static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst > goto out_error; > > out: > - free(l2_table); > return length; > > zero_cluster: > @@ -221,15 +400,16 @@ static ssize_t qcow1_write_cluster(struct qcow *q, u64 offset, void *buf, u32 sr > if (len > src_len) > len = src_len; > > - l2t = calloc(l2t_sz, sizeof(u64)); > - if (!l2t) > - goto error; > - > l2t_off = table->l1_table[l1t_idx] & ~header->oflag_mask; > if (l2t_off) { > - if (pread_in_full(q->fd, l2t, l2t_sz * sizeof(u64), l2t_off) < 0) > - goto free_l2; > + /* cache level2 table */ > + if (qcow1_read_l2_table(q, &l2t, l2t_off) < 0) > + goto error; > } else { > + l2t = calloc(l2t_sz, sizeof(u64)); sizeof(*l2t) > + if (!l2t) > + goto error; > + > /* Capture the state of the consistent QCOW image */ > f_sz = file_size(q->fd); > if (!f_sz) > @@ -251,6 +431,13 @@ static ssize_t qcow1_write_cluster(struct qcow *q, u64 offset, void *buf, u32 sr > goto free_l2; > } > > + if (cache_table(q, l2t, l2t_off) < 0) { > + if (ftruncate(q->fd, f_sz) < 0) > + goto free_l2; > + > + goto free_l2; > + } > + > /* Update the in-core entry */ > table->l1_table[l1t_idx] = l2t_off; > } > @@ -258,17 +445,15 @@ static ssize_t qcow1_write_cluster(struct qcow *q, u64 offset, void *buf, u32 sr > /* Capture the state of the consistent QCOW image */ > f_sz = file_size(q->fd); > if (!f_sz) > - goto free_l2; > + goto error; > > - clust_start = be64_to_cpu(l2t[l2t_idx]) & ~header->oflag_mask; > + clust_start = l2t[l2t_idx] & ~header->oflag_mask; > if (!clust_start) { > clust_start = ALIGN(f_sz, clust_sz); > update_meta = true; > } else > update_meta = false; > > - free(l2t); > - > /* Write actual data */ > if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0) > goto error; > @@ -282,9 +467,13 @@ static ssize_t qcow1_write_cluster(struct qcow *q, u64 offset, void *buf, u32 sr > > goto error; > } > + > + /* Update the cached level2 entry */ > + l2t[l2t_idx] = clust_start; > } > > return len; > + > free_l2: > free(l2t); > error: > @@ -335,6 +524,7 @@ static void qcow1_disk_close(struct disk_image *disk) > > q = disk->priv; > > + free_cache(q); > free(q->table.l1_table); > free(q->header); > free(q); > @@ -425,6 +615,8 @@ static struct disk_image *qcow2_probe(int fd, bool readonly) > goto error; > > q->fd = fd; > + q->root = RB_ROOT; > + INIT_LIST_HEAD(&q->lru_list); > > h = q->header = qcow2_read_header(fd); > if (!h) > @@ -519,6 +711,8 @@ static struct disk_image *qcow1_probe(int fd, bool readonly) > goto error; > > q->fd = fd; > + q->root = RB_ROOT; > + INIT_LIST_HEAD(&q->lru_list); > > h = q->header = qcow1_read_header(fd); > if (!h) -- Sasha. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html