Re: [PATCH v3] kvm tools: Add QCOW level2 caching support

Sasha Levin <levinsasha928@xxxxxxxxx> · Fri, 03 Jun 2011 00:19:21 +0300

On Thu, 2011-06-02 at 21:01 +0100, Prasad Joshi wrote:
> QCOW uses two tables level1 (L1) table and level2 (L2) table. The L1 table
> points to offset of L2 table. When a QCOW image is probed, the L1 table is
> cached in the memory to avoid reading it from disk on every access. This
> caching improves the performance.
> 
> The similar performance improvement can be observed when L2 tables are cached.
> It is impossible to cache all of the L2 tables because of the memory
> constraint. The patch adds L2 table caching capability for up to 128 L2 tables,
> it uses combination of RB tree and List to manage the L2 cached tables. The
> link list implementation helps in building simple LRU structure and RB tree
> helps to search cached table efficiently
> 
> The performance numbers are below, the machine was started with following
> command line arguments
> 
> $ ./kvm run -d /home/prasad/VMDisks/Ubuntu10.10_64_cilk_qemu.qcow \
> > --params "root=/dev/vda1" -m 1024
> 
> Without QCOW caching
> ====================
> $ bonnie++ -d tmp/ -c 10 -s 2048
> Writing a byte at a time...done
> Writing intelligently...done
> Rewriting...done
> Reading a byte at a time...done
> Reading intelligently...done
> start 'em...done...done...done...done...done...
> Create files in sequential order...done.
> Stat files in sequential order...done.
> Delete files in sequential order...done.
> Create files in random order...done.
> Stat files in random order...done.
> Delete files in random order...done.
> Version  1.96       ------Sequential Output------ --Sequential Input- --Random-
> Concurrency  10     -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
> Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
> prasad-virtual-m 2G  1043  99 555406  74 227605  55  5360  99 489080  68 +++++ +++
> Latency             24646us   48544us   57893us    6686us    3595us   21026us
> Version  1.96       ------Sequential Create------ --------Random Create--------
> prasad-virtual-mach -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
>               files  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
>                  16 +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++
> Latency               343us    1175us     327us     555us      48us      82us
> 1.96,1.96,prasad-virtual-machine,10,1307043085,2G,,1043,99,555406,74,227605,55,
> 5360,99,489080,68,+++++,+++,16,,,,,+++++,+++,+++++,+++,+++++,+++,+++++,+++,
> +++++,+++,+++++,+++,24646us,48544us,57893us,6686us,3595us,21026us,343us,1175us,
> 327us,555us,48us,82us
> 
> With QCOW caching
> =================
> $ bonnie++ -d tmp/ -c 10 -s 2048
> Writing a byte at a time...done
> Writing intelligently...done
> Rewriting...done
> Reading a byte at a time...done
> Reading intelligently...done
> start 'em...done...done...done...done...done...
> Create files in sequential order...done.
> Stat files in sequential order...done.
> Delete files in sequential order...done.
> Create files in random order...done.
> Stat files in random order...done.
> Delete files in random order...done.
> Version  1.96       ------Sequential Output------ --Sequential Input- --Random-
> Concurrency  10     -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
> Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
> prasad-virtual-m 2G  1033  99 467899  64 182387  41  5422 100 338294  48 +++++ +++
> Latency             21549us   60585us   65723us    6331us   30014us   19994us
> Version  1.96       ------Sequential Create------ --------Random Create--------
> prasad-virtual-mach -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
>               files  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
>                  16 +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++
> Latency               478us    1142us     344us     402us      72us      98us
> 1.96,1.96,prasad-virtual-machine,10,1307042839,2G,,1033,99,467899,64,182387,41,
> 5422,100,338294,48,+++++,+++,16,,,,,+++++,+++,+++++,+++,+++++,+++,+++++,+++,
> +++++,+++,+++++,+++,21549us,60585us,65723us,6331us,30014us,19994us,478us,1142us,
> 344us,402us,72us,98us
> 
> Summary of performance numbers
> ==============================
> There is not much difference with sequential character operations are
> performed, the code with caching performed better by small margin. The caching
> code performance raised by 18% to 24% with sequential block output and by 44%
> for sequentail block input. Which is understandable as the Level2 table will
> always be cached after a write operation. Random seek operation worked slower
> with caching code.
> 
> Signed-off-by: Prasad Joshi <prasadjoshi124@xxxxxxxxx>
> ---
>  tools/kvm/disk/qcow.c        |  216 ++++++++++++++++++++++++++++++++++++++----
>  tools/kvm/include/kvm/qcow.h |   17 ++++
>  2 files changed, 216 insertions(+), 17 deletions(-)
> 
> diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
> index b52b734..4fa3850 100644
> --- a/tools/kvm/disk/qcow.c
> +++ b/tools/kvm/disk/qcow.c
> @@ -16,6 +16,140 @@
>  #include <linux/kernel.h>
>  #include <linux/types.h>
>  
> +static int insert(struct rb_root *root, struct qcow_l2_cache *new)
> +{
> +	struct rb_node **link = &(root->rb_node), *parent = NULL;
> +	u64 offset = new->offset;
> +
> +	/* search the tree */
> +	while (*link) {
> +		struct qcow_l2_cache *t;
> +
> +		t = rb_entry(*link, struct qcow_l2_cache, node);
> +		if (!t)
> +			goto error;
> +
> +		parent = *link;
> +
> +		if (t->offset > offset)
> +			link = &(*link)->rb_left;
> +		else if (t->offset < offset)
> +			link = &(*link)->rb_right;
> +		else
> +			goto out;
> +	}
> +
> +	/* add new node */
> +	rb_link_node(&new->node, parent, link);
> +	rb_insert_color(&new->node, root);
> +out:
> +	return 0;
> +error:
> +	return -1;
> +}
> +
> +static struct qcow_l2_cache *search(struct rb_root *root, u64 offset)
> +{
> +	struct rb_node *link = root->rb_node;
> +
> +	while (link) {
> +		struct qcow_l2_cache *t;
> +
> +		t = rb_entry(link, struct qcow_l2_cache, node);
> +		if (!t)
> +			goto out;
> +
> +		if (t->offset > offset)
> +			link = link->rb_left;
> +		else if (t->offset < offset)
> +			link = link->rb_right;
> +		else
> +			return t;
> +	}
> +out:
> +	return NULL;
> +}
> +
> +static void free_cache(struct qcow *q)
> +{
> +	struct list_head *pos, *n;
> +	struct qcow_l2_cache *t;
> +	struct rb_root *r = &q->root;
> +
> +	list_for_each_safe(pos, n, &q->lru_list) {
> +		/* Remove cache table from the list and RB tree */
> +		list_del(pos);
> +		t = list_entry(pos, struct qcow_l2_cache, list);
> +		rb_erase(&t->node, r);
> +
> +		/* Free the cached node */
> +		free(t->table);
> +		free(t);
> +	}
> +}
> +
> +static int cache_table(struct qcow *q, u64 *table, u64 offset)
> +{
> +	struct qcow_l2_cache *n;
> +	struct rb_root *r = &q->root;
> +	struct qcow_l2_cache *lru;
> +
> +	n = calloc(1, sizeof(struct qcow_l2_cache));
			sizeof(*n)
sizeof() should use the variable name itself, not the data type. Check
out chapter 14 in 'Documentation/CodingStyle'.

> +	if (!n)
> +		goto error;
> +
> +	n->offset = offset;
> +	n->table  = table;
> +	n->qcow   = q;
> +	RB_CLEAR_NODE(&n->node);
> +	INIT_LIST_HEAD(&n->list);
> +
> +	if (q->no_cached == MAX_CACHE_NODES) {
> +		/* Find the node for LRU replacement */
> +		lru = list_first_entry(&q->lru_list, struct qcow_l2_cache, list);
> +
> +		/* Remove the node from the cache */
> +		rb_erase(&lru->node, r);
> +		list_del_init(&lru->list);
> +		q->no_cached--;
> +
> +		/* Free the LRUed node */
> +		free(lru->table);
> +		free(lru);
> +	}
> +
> +	/* Add new node in RB Tree: Helps in searching faster */
> +	if (insert(r, n) < 0)
> +		goto error;
> +
> +	/* Add in LRU replacement list */
> +	list_add_tail(&n->list, &q->lru_list);
> +	q->no_cached++;
> +
> +	return 0;
> +error:
> +	free(n);
> +	return -1;
> +}
> +
> +static int search_table(struct qcow *q, u64 **table, u64 offset)
> +{
> +	struct qcow_l2_cache *c;
> +
> +	*table = NULL;
> +
> +	c = search(&q->root, offset);
> +	if (!c)
> +		return -1;
> +
> +	/* Update the LRU state */
> +	list_del_init(&c->list);
> +	list_add_tail(&c->list, &q->lru_list);
> +
> +	*table = c->table;
> +	return 0;
> +}
> +
>  static inline u64 get_l1_index(struct qcow *q, u64 offset)
>  {
>  	struct qcow_header *header = q->header;
> @@ -37,6 +171,43 @@ static inline u64 get_cluster_offset(struct qcow *q, u64 offset)
>  	return offset & ((1 << header->cluster_bits)-1);
>  }
>  
> +static int qcow_read_l2_table(struct qcow *q, u64 **table, u64 offset)
> +{
> +	struct qcow_header *header = q->header;
> +	u64 size;
> +	u64 i;
> +	u64 *t;
> +
> +	*table = NULL;
> +	size   = 1 << header->l2_bits;
> +
> +	/* search an entry for offset in cache */
> +	if (search_table(q, table, offset) >= 0)
> +		return 0;
> +
> +	t = calloc(size, sizeof(*t));
> +	if (!t)
> +		goto error;
> +
> +	/* table not cached: read from the disk */
> +	if (pread_in_full(q->fd, t, size * sizeof(u64), offset) < 0)
> +		goto error;
> +
> +	/* cache the table */
> +	if (cache_table(q, t, offset) < 0)
> +		goto error;
> +
> +	/* change cached table to CPU's byte-order */
> +	for (i = 0; i < size; i++)
> +		be64_to_cpus(&t[i]);
> +
> +	*table = t;
> +	return 0;
> +error:
> +	free(t);
> +	return -1;
> +}
> +
>  static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_len)
>  {
>  	struct qcow_header *header = q->header;
> @@ -51,7 +222,6 @@ static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_
>  	u64 l1_idx;
>  	u64 l2_idx;
>  
> -	l2_table = NULL;
>  
>  	cluster_size = 1 << header->cluster_bits;
>  
> @@ -72,18 +242,16 @@ static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_
>  		goto zero_cluster;
>  
>  	l2_table_size = 1 << header->l2_bits;
> -	l2_table = calloc(l2_table_size, sizeof(u64));
> -	if (!l2_table)
> -		goto out_error;
>  
> -	if (pread_in_full(q->fd, l2_table, l2_table_size * sizeof(u64), l2_table_offset) < 0)
> +	/* read and cache level 2 table */
> +	if (qcow_read_l2_table(q, &l2_table, l2_table_offset) < 0)
>  		goto out_error;
>  
>  	l2_idx = get_l2_index(q, offset);
>  	if (l2_idx >= l2_table_size)
>  		goto out_error;
>  
> -	clust_start = be64_to_cpu(l2_table[l2_idx]) & ~header->oflag_mask;
> +	clust_start = l2_table[l2_idx] & ~header->oflag_mask;
>  	if (!clust_start)
>  		goto zero_cluster;
>  
> @@ -91,7 +259,6 @@ static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_
>  		goto out_error;
>  
>  out:
> -	free(l2_table);
>  	return length;
>  
>  zero_cluster:
> @@ -221,15 +388,16 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
>  	if (len > src_len)
>  		len = src_len;
>  
> -	l2t = calloc(l2t_sz, sizeof(u64));
> -	if (!l2t)
> -		goto error;
> -
>  	l2t_off		= table->l1_table[l1t_idx] & ~header->oflag_mask;
>  	if (l2t_off) {
> -		if (pread_in_full(q->fd, l2t, l2t_sz * sizeof(u64), l2t_off) < 0)
> -			goto free_l2;
> +		/* cache level2 table */
> +		if (qcow_read_l2_table(q, &l2t, l2t_off) < 0)
> +			goto error;
>  	} else {
> +		l2t = calloc(l2t_sz, sizeof(*l2t));
> +		if (!l2t)
> +			goto error;
> +
>  		/* Capture the state of the consistent QCOW image */
>  		f_sz		= file_size(q->fd);
>  		if (!f_sz)
> @@ -251,6 +419,13 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
>  			goto free_l2;
>  		}
>  
> +		if (cache_table(q, l2t, l2t_off) < 0) {
> +			if (ftruncate(q->fd, f_sz) < 0)
> +				goto free_l2;
> +
> +			goto free_l2;
> +		}
> +
>  		/* Update the in-core entry */
>  		table->l1_table[l1t_idx] = l2t_off;
>  	}
> @@ -258,17 +433,15 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
>  	/* Capture the state of the consistent QCOW image */
>  	f_sz		= file_size(q->fd);
>  	if (!f_sz)
> -		goto free_l2;
> +		goto error;
>  
> -	clust_start	= be64_to_cpu(l2t[l2t_idx]) & ~header->oflag_mask;
> +	clust_start	= l2t[l2t_idx] & ~header->oflag_mask;
>  	if (!clust_start) {
>  		clust_start	= ALIGN(f_sz, clust_sz);
>  		update_meta	= true;
>  	} else
>  		update_meta	= false;
>  
> -	free(l2t);
> -
>  	/* Write actual data */
>  	if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
>  		goto error;
> @@ -282,9 +455,13 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
>  
>  			goto error;
>  		}
> +
> +		/* Update the cached level2 entry */
> +		l2t[l2t_idx] = clust_start;
>  	}
>  
>  	return len;
> +
>  free_l2:
>  	free(l2t);
>  error:
> @@ -336,6 +513,7 @@ static int qcow_disk_close(struct disk_image *disk)
>  
>  	q = disk->priv;
>  
> +	free_cache(q);
>  	free(q->table.l1_table);
>  	free(q->header);
>  	free(q);
> @@ -428,6 +606,8 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
>  		goto error;
>  
>  	q->fd = fd;
> +	q->root = RB_ROOT;
> +	INIT_LIST_HEAD(&q->lru_list);
>  
>  	h = q->header = qcow2_read_header(fd);
>  	if (!h)
> @@ -525,6 +705,8 @@ static struct disk_image *qcow1_probe(int fd, bool readonly)
>  		goto error;
>  
>  	q->fd = fd;
> +	q->root = RB_ROOT;
> +	INIT_LIST_HEAD(&q->lru_list);
>  
>  	h = q->header = qcow1_read_header(fd);
>  	if (!h)
> diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
> index b6e7493..1663819 100644
> --- a/tools/kvm/include/kvm/qcow.h
> +++ b/tools/kvm/include/kvm/qcow.h
> @@ -3,6 +3,8 @@
>  
>  #include <linux/types.h>
>  #include <stdbool.h>
> +#include <linux/rbtree.h>
> +#include <linux/list.h>
>  
>  #define QCOW_MAGIC		(('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
>  
> @@ -17,6 +19,16 @@
>  #define QCOW2_OFLAG_COMPRESSED	(1LL << 62)
>  #define QCOW2_OFLAG_MASK	(QCOW2_OFLAG_COPIED|QCOW2_OFLAG_COMPRESSED)
>  
> +#define MAX_CACHE_NODES         128
> +
> +struct qcow_l2_cache {
> +	u64                     offset;
> +	u64                     *table;
> +	struct rb_node          node;
> +	struct list_head        list;
> +	struct qcow             *qcow;

I only see 'qcow' being initialized, but isn't used anywhere. Is it
really needed?

> +};
> +
>  struct qcow_table {
>  	u32			table_size;
>  	u64			*l1_table;
> @@ -26,6 +38,11 @@ struct qcow {
>  	void			*header;
>  	struct qcow_table	table;
>  	int			fd;
> +
> +	/* Level2 caching data structures */
> +	struct rb_root          root;
> +	struct list_head        lru_list;
> +	int                     no_cached;
>  };
>  
>  struct qcow_header {

-- 

Sasha.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html