On 18.11.2011 09:47, Lan Tianyu wrote:
> When meeting request to write the cluster without copied flag,
> allocate a new cluster and write original data with modification
> to the new cluster. This also adds support for the writing operation
> of the qcow2 compressed image. After testing, image file can pass
> through "qemu-img check".
>
> Please enter the commit message for your changes. Lines starting
>
> Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx>
> ---
>  tools/kvm/disk/qcow.c        |  366 +++++++++++++++++++++++++++++-------------
>  tools/kvm/include/kvm/qcow.h |    2 +
>  2 files changed, 255 insertions(+), 113 deletions(-)
>
> diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
> index 680b37d..4d9125d 100644
> --- a/tools/kvm/disk/qcow.c
> +++ b/tools/kvm/disk/qcow.c
> @@ -122,9 +122,6 @@ static int cache_table(struct qcow *q, struct qcow_l2_table *c)
>  	 */
>  	lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list);
>
> -	if (qcow_l2_cache_write(q, lru) < 0)
> -		goto error;
> -
>  	/* Remove the node from the cache */
>  	rb_erase(&lru->node, r);
>  	list_del_init(&lru->list);
> @@ -618,9 +615,6 @@ static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
>  	if (rft->nr_cached == MAX_CACHE_NODES) {
>  		lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
>
> -		if (write_refcount_block(q, lru) < 0)
> -			goto error;
> -
>  		rb_erase(&lru->node, r);
>  		list_del_init(&lru->list);
>  		rft->nr_cached--;
> @@ -706,6 +700,11 @@ static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64
>
>  	rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
>
> +	if (!rfb_offset) {
> +		pr_warning("Don't support to grow refcount table");
> +		return NULL;
> +	}
> +
>  	rfb = refcount_block_search(q, rfb_offset);
>  	if (rfb)
>  		return rfb;
> @@ -728,35 +727,121 @@ error_free_rfb:
>  	return NULL;
>  }
>
> -/*
> - * QCOW file might grow during a write operation. Not only data but metadata is
> - * also written at the end of the file. Therefore it is necessary to ensure
> - * every write is committed to disk. Hence we use uses qcow_pwrite_sync() to
> - * synchronize the in-core state of QCOW image to disk.
> - *
> - * We also try to restore the image to a consistent state if the metdata
> - * operation fails. The two metadat operations are: level 1 and level 2 table
> - * update. If either of them fails the image is truncated to a consistent state.
> +static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx)
> +{
> +	struct qcow_refcount_block *rfb = NULL;
> +	struct qcow_header *header = q->header;
> +	u64 rfb_idx;
> +
> +	rfb = qcow_read_refcount_block(q, clust_idx);
> +	if (!rfb) {
> +		pr_warning("Error while reading refcount table");
> +		return -1;
> +	}
> +
> +	rfb_idx = clust_idx & (((1ULL <<
> +		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
> +
> +	if (rfb_idx >= rfb->size) {
> +		pr_warning("L1: refcount block index out of bounds");
> +		return -1;
> +	}
> +
> +	return be16_to_cpu(rfb->entries[rfb_idx]);
> +}
> +
> +static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append)
> +{
> +	struct qcow_refcount_block *rfb = NULL;
> +	struct qcow_header *header = q->header;
> +	u16 refcount;
> +	u64 rfb_idx;
> +
> +	rfb = qcow_read_refcount_block(q, clust_idx);
> +	if (!rfb) {
> +		pr_warning("error while reading refcount table");
> +		return -1;
> +	}
> +
> +	rfb_idx = clust_idx & (((1ULL <<
> +		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
> +	if (rfb_idx >= rfb->size) {
> +		pr_warning("refcount block index out of bounds");
> +		return -1;
> +	}
> +
> +	refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append;
> +	rfb->entries[rfb_idx] = cpu_to_be16(refcount);
> +	rfb->dirty = 1;
> +
> +	/*write refcount block*/
> +	write_refcount_block(q, rfb);

Missing error handling.

> +
> +	/*update free_clust_idx since refcount becomes zero*/
> +	if (!refcount && clust_idx < q->free_clust_idx)
> +		q->free_clust_idx = clust_idx;
> +
> +	return 0;
> +}
> +
> +static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size)
> +{
> +	struct qcow_header *header = q->header;
> +	u64 start, end, offset;
> +
> +	start = clust_start & ~(q->cluster_size - 1);
> +	end = (clust_start + size - 1) & ~(q->cluster_size - 1);
> +	for (offset = start; offset <= end; offset += q->cluster_size)
> +		update_cluster_refcount(q, offset >> header->cluster_bits, -1);
> +}
> +
> +/*Allocate clusters according to the size. Find a postion that
> + *can satisfy the size. free_clust_idx is initialized to zero and
> + *Record last position.
> +*/
> +static u64 qcow_alloc_clusters(struct qcow *q, u64 size)
> +{
> +	struct qcow_header *header = q->header;
> +	u16 clust_refcount;
> +	u32 clust_idx, i;
> +	u64 clust_num;
> +
> +	clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;
> +
> +again:
> +	for (i = 0; i < clust_num; i++) {
> +		clust_idx = q->free_clust_idx++;
> +		clust_refcount = qcow_get_refcount(q, clust_idx);
> +		if (clust_refcount < 0)
> +			return -1;
> +		else if (clust_refcount > 0)
> +			goto again;
> +	}
> +
> +	for (i = 0; i < clust_num; i++)
> +		update_cluster_refcount(q,
> +			q->free_clust_idx - clust_num + i, 1);

Error handling again.
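Just to illustrate what I mean, an untested sketch of both fixes. It
only uses functions that already exist in this patch, and it keeps the
patch's convention of returning -1 from a u64-returning function
(itself dubious, but a separate issue):

	/* in update_cluster_refcount(): don't ignore the write result */
	if (write_refcount_block(q, rfb) < 0) {
		pr_warning("refcount block write error");
		return -1;
	}

	/* in qcow_alloc_clusters(): roll back partial refcount bumps
	 * instead of leaving some clusters half-allocated */
	for (i = 0; i < clust_num; i++) {
		if (update_cluster_refcount(q,
				q->free_clust_idx - clust_num + i, 1) < 0) {
			while (i--)
				update_cluster_refcount(q,
					q->free_clust_idx - clust_num + i, -1);
			return -1;
		}
	}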
> +
> +	return (q->free_clust_idx - clust_num) << header->cluster_bits;
> +}
> +
> +/*Get l2 table. If the table has been copied, read table directly.
> + *If the table exists, allocate a new cluster and copy the table
> + *to the new cluster.
>  */
> -static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
> +static int get_cluster_table(struct qcow *q, u64 offset,
> +	struct qcow_l2_table **result_l2t, u64 *result_l2_index)
>  {
>  	struct qcow_header *header = q->header;
>  	struct qcow_l1_table *l1t = &q->table;
>  	struct qcow_l2_table *l2t;
> -	u64 clust_start;
> -	u64 clust_flags;
> -	u64 l2t_offset;
> -	u64 clust_off;
> -	u64 l2t_size;
> -	u64 clust_sz;
>  	u64 l1t_idx;
> +	u64 l2t_offset;
>  	u64 l2t_idx;
> -	u64 f_sz;
> -	u64 len;
> +	u64 l2t_size;
> +	u64 l2t_new_offset;
>
> -	l2t = NULL;
> -	l2t_size = 1 << header->l2_bits;
> -	clust_sz = 1 << header->cluster_bits;
> +	l2t_size = 1 << header->l2_bits;
>
>  	l1t_idx = get_l1_index(q, offset);
>  	if (l1t_idx >= l1t->table_size)
> @@ -766,122 +851,166 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
>  	if (l2t_idx >= l2t_size)
>  		return -1;
>
> -	clust_off = get_cluster_offset(q, offset);
> -	if (clust_off >= clust_sz)
> -		return -1;
> -
> -	len = clust_sz - clust_off;
> -	if (len > src_len)
> -		len = src_len;
> -
> -	mutex_lock(&q->mutex);
> -
>  	l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]);
> -	if (l2t_offset & QCOW2_OFLAG_COMPRESSED) {
> -		pr_warning("compressed clusters are not supported");
> -		goto error;
> -	}
> -	if (!(l2t_offset & QCOW2_OFLAG_COPIED)) {
> -		pr_warning("L2 copy-on-write clusters are not supported");
> -		goto error;
> -	}
> -
> -	l2t_offset &= QCOW2_OFFSET_MASK;
> -	if (l2t_offset) {
> -		/* read and cache l2 table */
> +	if (l2t_offset & QCOW2_OFLAG_COPIED) {
> +		l2t_offset &= ~QCOW2_OFLAG_COPIED;
>  		l2t = qcow_read_l2_table(q, l2t_offset);
>  		if (!l2t)
>  			goto error;
>  	} else {
> -		l2t = new_cache_table(q, l2t_offset);
> -		if (!l2t)
> +		l2t_new_offset = qcow_alloc_clusters(q, l2t_size*sizeof(u64));
> +		if (l2t_new_offset < 0)
>  			goto error;
>
> -		/* Capture the state of the consistent QCOW image */
> -		f_sz = file_size(q->fd);
> -		if (!f_sz)
> -			goto free_cache;
> +		l2t = new_cache_table(q, l2t_new_offset);
> +		if (!l2t)
> +			goto free_cluster;
> +
> +		if (l2t_offset)
> +			qcow2_read_cluster(q, l2t_offset, l2t->table,
> +				l2t_size*sizeof(u64));

Another ignored return value; there must be a system behind it. :-)

> +		else
> +			memset(l2t->table, 0x00, l2t_size * sizeof(u64));
>
> -		/* Write the l2 table of 0's at the end of the file */
> -		l2t_offset = qcow_write_l2_table(q, l2t->table);
> -		if (!l2t_offset)
> +		/*write l2 table*/
> +		l2t->dirty = 1;
> +		if (qcow_l2_cache_write(q, l2t) < 0)
>  			goto free_cache;

You need to make sure that the refcount update is written first (e.g.
with fsync), otherwise you risk corruption when the host crashes in
the middle.

>
> -		if (cache_table(q, l2t) < 0) {
> -			if (ftruncate(q->fd, f_sz) < 0)
> -				goto free_cache;
> +		/* Update the l1 talble */
> +		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset
> +			| QCOW2_OFLAG_COPIED);
>
> -			goto free_cache;
> -		}
> +		if (pwrite_in_full(q->fd, l1t->l1_table,
> +			l1t->table_size * sizeof(u64),
> +			header->l1_table_offset) < 0)
> +			goto error;

Likewise, the L1 table update must be ordered against the L2 table
write. Also, goto error is using the wrong label here; presumably it
should be free_cache, so that the new cluster and the cache entry
don't leak.

>
> -		/* Update the in-core entry */
> -		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_offset);
> +		/*cache l2 table*/
> +		cache_table(q, l2t);

After so many explicit comments, you can probably guess what's wrong
here... (hint: yet another unchecked return value)
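Schematically, the crash-safe order for this branch would look
something like the sketch below. It is untested; fdatasync() is just
one way to enforce the barriers (the code this patch removes used
qcow_pwrite_sync() for the same purpose), and it glosses over the fact
that unwinding becomes much harder once the L1 entry has hit the disk:

	/* 1. qcow_alloc_clusters() already bumped the refcounts for the
	 *    new L2 cluster; flush them so that a crash can at worst
	 *    leak a cluster, never hand it out twice */
	if (fdatasync(q->fd) < 0)
		goto free_cluster;

	/* 2. write the new L2 table and flush it before anything
	 *    points to it */
	l2t->dirty = 1;
	if (qcow_l2_cache_write(q, l2t) < 0)
		goto free_cache;
	if (fdatasync(q->fd) < 0)
		goto free_cache;

	/* 3. only now publish the new table in the L1 table; on
	 *    failure, unwind through free_cache, not error */
	l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset
		| QCOW2_OFLAG_COPIED);
	if (pwrite_in_full(q->fd, l1t->l1_table,
			l1t->table_size * sizeof(u64),
			header->l1_table_offset) < 0)
		goto free_cache;

	/* 4. and check the result of cache_table(), too */
	if (cache_table(q, l2t) < 0)
		goto free_cache;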
> +
> +		/*free old cluster*/
> +		qcow_free_clusters(q, l2t_offset, q->cluster_size);
>  	}
>
> -	/* Capture the state of the consistent QCOW image */
> -	f_sz = file_size(q->fd);
> -	if (!f_sz)
> -		goto error;
> +	*result_l2t = l2t;
> +	*result_l2_index = l2t_idx;
>
> -	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
> +	return 0;
>
> -	clust_flags = clust_start & QCOW2_OFLAGS_MASK;
> -	if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
> -		pr_warning("compressed clusters are not supported");
> +free_cache:
> +	free(l2t);
> +
> +free_cluster:
> +	qcow_free_clusters(q, l2t_new_offset, q->cluster_size);
> +
> +error:
> +	return -1;
> +}
> +
> +/*If the cluster has been copied, write data directly. If not,
> + *read the original data and write it to the new cluster with
> + *modification.
> +*/
> +static ssize_t qcow_write_cluster(struct qcow *q, u64 offset,
> +	void *buf, u32 src_len)
> +{
> +	struct qcow_header *header = q->header;
> +	struct qcow_l2_table *l2t;
> +	u64 clust_new_idx;
> +	u64 clust_new_start;
> +	u64 clust_start;
> +	u64 clust_flags;
> +	u64 clust_off;
> +	u64 l2t_idx;
> +	u64 len;
> +
> +	l2t = NULL;
> +
> +	clust_off = get_cluster_offset(q, offset);
> +	if (clust_off >= q->cluster_size)
> +		return -1;
> +
> +	len = q->cluster_size - clust_off;
> +	if (len > src_len)
> +		len = src_len;
> +
> +	mutex_lock(&q->mutex);
> +
> +	if (get_cluster_table(q, offset, &l2t, &l2t_idx)) {
> +		pr_warning("Get l2 table error");
>  		goto error;
>  	}
>
> -	clust_start &= QCOW2_OFFSET_MASK;
> -	if (!clust_start) {
> -		clust_start = ALIGN(f_sz, clust_sz);
> -		l2t->table[l2t_idx] = cpu_to_be64(clust_start | QCOW2_OFLAG_COPIED);
> -		l2t->dirty = 1;
> -	}
> +	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
> +	clust_flags = clust_start & QCOW2_OFLAGS_MASK;
>
> +	clust_start &= QCOW2_OFFSET_MASK;
>  	if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
> -		struct qcow_refcount_block *rfb = NULL;
> -		u16 clust_refcount;
> -		u64 clust_idx;
> -		u64 rfb_idx;
> -
> -		clust_idx = (clust_start & QCOW2_OFFSET_MASK)
> -			>> (header->cluster_bits);
>
> -		rfb = qcow_read_refcount_block(q, clust_idx);
> -		if (!rfb) {
> -			pr_warning("L1: error while reading refcount table");
> +		clust_new_start = qcow_alloc_clusters(q, q->cluster_size);
> +		if (clust_new_start < 0) {
> +			pr_warning("Cluster alloc error!");
>  			goto error;
>  		}
>
> -		rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
> -		if (rfb_idx >= rfb->size) {
> -			pr_warning("L1: refcount block index out of bounds");
> -			goto error;
> -		}
> +		clust_new_idx = clust_new_start >> header->cluster_bits;
> +		offset &= ~(q->cluster_size - 1);
> +
> +		/*if clust_start is not zero, read the original data*/
> +		if (clust_start) {
> +			mutex_unlock(&q->mutex);
> +			if (qcow2_read_cluster(q, offset, q->copy_buff,
> +				q->cluster_size) < 0) {
> +				pr_warning("Read copy cluster error");
> +				qcow_free_clusters(q, clust_new_start,
> +					q->cluster_size);
> +				return -1;
> +			}
> +			mutex_lock(&q->mutex);
> +		} else
> +			memset(q->copy_buff, 0x00, q->cluster_size);
> +
> +		memcpy(q->copy_buff + clust_off, buf, len);
> +
> +		/* Write actual data */
> +		if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size,
> +			clust_new_start) < 0)
> +			goto free_cluster;
> +
> +		/*update l2 table*/
> +		l2t->table[l2t_idx] = cpu_to_be64(clust_new_start
> +			| QCOW2_OFLAG_COPIED);
> +		l2t->dirty = 1;
> +
> +		if (qcow_l2_cache_write(q, l2t))
> +			goto free_cluster;

Cluster allocation must be ordered against the L2 update: the data and
the refcount bump must be safely on disk before the L2 entry points to
the new cluster.
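Same idea as the sketch further up, and with the same fdatasync()
caveat, i.e. it is only meant to show the required ordering:

	/* the guest data, and the refcount bump already done by
	 * qcow_alloc_clusters(), must be stable on disk before the L2
	 * entry makes the new cluster reachable */
	if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size,
			clust_new_start) < 0)
		goto free_cluster;

	if (fdatasync(q->fd) < 0)
		goto free_cluster;

	/* only then update the L2 entry */
	l2t->table[l2t_idx] = cpu_to_be64(clust_new_start
		| QCOW2_OFLAG_COPIED);
	l2t->dirty = 1;
	if (qcow_l2_cache_write(q, l2t) < 0)
		goto free_cluster;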
> +
> +		/*free old cluster*/
> +		if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
> +			int size;
> +			size = ((clust_start >> q->csize_shift) &
> +				q->csize_mask) + 1;
> +			size *= 512;
> +			clust_start &= q->cluster_offset_mask;
> +			clust_start &= ~511;
> +
> +			qcow_free_clusters(q, clust_start, size);
> +		} else if (clust_start)
> +			qcow_free_clusters(q, clust_start, q->cluster_size);
>
> -		clust_refcount = be16_to_cpu(rfb->entries[rfb_idx]);
> -		if (!clust_refcount) {
> -			clust_refcount = 1;
> -			rfb->entries[rfb_idx] = cpu_to_be16(clust_refcount);
> -			rfb->dirty = 1;
> -		}
> -
> -		if (clust_refcount > 1) {
> -			pr_warning("L1 copy-on-write clusters are not supported");
> +	} else {
> +		/* Write actual data */
> +		if (pwrite_in_full(q->fd, buf, len,
> +			clust_start + clust_off) < 0)
>  			goto error;
> -		}
>  	}
> -
>  	mutex_unlock(&q->mutex);
> -
> -	/* Write actual data */
> -	if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
> -		return -1;
> -
>  	return len;
>
> -free_cache:
> -	free(l2t);
> +free_cluster:
> +	qcow_free_clusters(q, clust_new_start, q->cluster_size);
> +
>  error:
>  	mutex_unlock(&q->mutex);
>  	return -1;
> @@ -993,6 +1122,7 @@ static int qcow_disk_close(struct disk_image *disk)
>
>  	refcount_table_free_cache(&q->refcount_table);
>  	l1_table_free_cache(&q->table);
> +	free(q->copy_buff);
>  	free(q->cluster_data);
>  	free(q->cluster_cache);
>  	free(q->refcount_table.rf_table);
> @@ -1117,10 +1247,16 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
>  	q->cluster_offset_mask = (1LL << q->csize_shift) - 1;
>  	q->cluster_size = 1 << q->header->cluster_bits;
>
> +	q->copy_buff = malloc(q->cluster_size);
> +	if (!q->copy_buff) {
> +		pr_warning("copy buff malloc error!");
> +		goto free_header;
> +	}
> +
>  	q->cluster_data = malloc(q->cluster_size);
>  	if (!q->cluster_data) {
>  		pr_warning("cluster data malloc error!");
> -		goto free_header;
> +		goto free_copy_buff;
>  	}
>
>  	q->cluster_cache = malloc(q->cluster_size);
> @@ -1163,6 +1299,9 @@ free_cluster_cache:
>  free_cluster_data:
>  	if (q->cluster_data)
>  		free(q->cluster_data);
> +free_copy_buff:
> +	if (q->cluster_data)
> +		free(q->cluster_data);

This looks like the wrong field: the label is free_copy_buff, but it
tests and frees q->cluster_data a second time instead of q->copy_buff.

>  free_header:
>  	if (q->header)
>  		free(q->header);
> @@ -1252,6 +1391,7 @@ static struct disk_image *qcow1_probe(int fd, bool readonly)
>  	q->version = QCOW1_VERSION;
>  	q->cluster_size = 1 << q->header->cluster_bits;
>  	q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1;
> +	q->free_clust_idx = 0;
>
>  	q->cluster_data = malloc(q->cluster_size);
>  	if (!q->cluster_data) {
> diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
> index bbf7913..e032a1e 100644
> --- a/tools/kvm/include/kvm/qcow.h
> +++ b/tools/kvm/include/kvm/qcow.h
> @@ -84,8 +84,10 @@ struct qcow {
>  	u32 version;
>  	u64 cluster_size;
>  	u64 cluster_offset_mask;
> +	u64 free_clust_idx;
>  	void *cluster_cache;
>  	void *cluster_data;
> +	void *copy_buff;
>  };
>
>  struct qcow1_header_disk {

Kevin