[RFC PATCH] kvm tools, qcow: Add the support for copy-on-write clusters

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



When a write request targets a cluster that does not have the
COPIED flag set, allocate a new cluster and write the original
data, with the modification applied, to the new cluster. This also
enables write support for compressed qcow2 images.

Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx>
---
 tools/kvm/disk/qcow.c        |  322 ++++++++++++++++++++++++++++--------------
 tools/kvm/include/kvm/qcow.h |    2 +
 2 files changed, 218 insertions(+), 106 deletions(-)

diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
index 680b37d..2b9af73 100644
--- a/tools/kvm/disk/qcow.c
+++ b/tools/kvm/disk/qcow.c
@@ -122,9 +122,6 @@ static int cache_table(struct qcow *q, struct qcow_l2_table *c)
 		 */
 		lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list);
 
-		if (qcow_l2_cache_write(q, lru) < 0)
-			goto error;
-
 		/* Remove the node from the cache */
 		rb_erase(&lru->node, r);
 		list_del_init(&lru->list);
@@ -728,35 +725,110 @@ error_free_rfb:
 	return NULL;
 }
 
-/*
- * QCOW file might grow during a write operation. Not only data but metadata is
- * also written at the end of the file. Therefore it is necessary to ensure
- * every write is committed to disk. Hence we use uses qcow_pwrite_sync() to
- * synchronize the in-core state of QCOW image to disk.
- *
- * We also try to restore the image to a consistent state if the metdata
- * operation fails. The two metadat operations are: level 1 and level 2 table
- * update. If either of them fails the image is truncated to a consistent state.
+static int qcow_get_refcount(struct qcow *q, u64 clust_idx)
+{
+	struct qcow_refcount_block *rfb = NULL;
+	struct qcow_header *header = q->header;
+	u64 rfb_idx;
+	/* int (not u16) return so the -1 error value is not truncated */
+	rfb = qcow_read_refcount_block(q, clust_idx);
+	if (!rfb) {
+		pr_warning("error while reading refcount table");
+		return -1;
+	}
+
+	rfb_idx = clust_idx & (((1ULL <<
+		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+
+	if (rfb_idx >= rfb->size) {
+		pr_warning("L1: refcount block index out of bounds");
+		return -1;
+	}
+
+	return be16_to_cpu(rfb->entries[rfb_idx]);
+}
+
+static int update_cluster_refcount(struct qcow *q, u64 clust_idx, s16 append)
+{
+	struct qcow_refcount_block *rfb = NULL;
+	struct qcow_header *header = q->header;
+	u16 refcount;
+	u64 rfb_idx;
+
+	rfb = qcow_read_refcount_block(q, clust_idx);
+	if (!rfb) {
+		pr_warning("error while reading refcount table");
+		return -1;
+	}
+
+	rfb_idx = clust_idx & (((1ULL <<
+		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+	if (rfb_idx >= rfb->size) {
+		pr_warning("refcount block index out of bounds");
+		return -1;
+	}
+
+	/* append is signed: callers pass -1 to drop a reference */
+	refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append;
+	rfb->entries[rfb_idx] = cpu_to_be16(refcount);
+	rfb->dirty = 1;
+	/* flush the updated refcount block to disk */
+	if (write_refcount_block(q, rfb) < 0)
+		return -1;
+
+	/* update free_clust_idx since the refcount became zero */
+	if (!refcount && clust_idx < q->free_clust_idx)
+		q->free_clust_idx = clust_idx;
+
+	return 0;
+}
+
+/* Allocate enough clusters to hold 'size' bytes. Scan for a run of
+ * clusters with a zero refcount, starting at free_clust_idx, which
+ * records the last scan position. Returns the byte offset of the
+ * first cluster, or (u64)-1 on failure. */
+static u64 qcow_alloc_clusters(struct qcow *q, u64 size)
+{
+	struct qcow_header *header = q->header;
+	int clust_refcount;
+	u64 clust_idx, i;
+	u64 clust_num;
+
+	clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;
+
+again:
+	for (i = 0; i < clust_num; i++) {
+		clust_idx = q->free_clust_idx++;
+		clust_refcount = qcow_get_refcount(q, clust_idx);
+		if (clust_refcount < 0)
+			return -1;
+		else if (clust_refcount > 0)
+			goto again;
+	}
+
+	for (i = 0; i < clust_num; i++)
+		update_cluster_refcount(q,
+			q->free_clust_idx - clust_num + i, 1);
+
+	return (q->free_clust_idx - clust_num) << header->cluster_bits;
+}
+
+/* Get the L2 table covering 'offset'. If the table already has the
+ * COPIED flag it is read and used directly. Otherwise a new cluster
+ * is allocated and the existing table (if any) is copied into it.
  */
-static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
+static int get_cluster_table(struct qcow *q, u64 offset,
+	struct qcow_l2_table **result_l2t, u64 *result_l2_index)
 {
 	struct qcow_header *header = q->header;
 	struct qcow_l1_table *l1t = &q->table;
 	struct qcow_l2_table *l2t;
-	u64 clust_start;
-	u64 clust_flags;
-	u64 l2t_offset;
-	u64 clust_off;
-	u64 l2t_size;
-	u64 clust_sz;
 	u64 l1t_idx;
+	u64 l2t_offset;
 	u64 l2t_idx;
-	u64 f_sz;
-	u64 len;
+	u64 l2t_size;
+	u64 l2t_new_offset;
 
-	l2t		= NULL;
-	l2t_size	= 1 << header->l2_bits;
-	clust_sz	= 1 << header->cluster_bits;
+	l2t_size = 1 << header->l2_bits;
 
 	l1t_idx = get_l1_index(q, offset);
 	if (l1t_idx >= l1t->table_size)
@@ -766,122 +838,149 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
 	if (l2t_idx >= l2t_size)
 		return -1;
 
-	clust_off = get_cluster_offset(q, offset);
-	if (clust_off >= clust_sz)
-		return -1;
-
-	len = clust_sz - clust_off;
-	if (len > src_len)
-		len = src_len;
-
-	mutex_lock(&q->mutex);
-
 	l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]);
-	if (l2t_offset & QCOW2_OFLAG_COMPRESSED) {
-		pr_warning("compressed clusters are not supported");
-		goto error;
-	}
-	if (!(l2t_offset & QCOW2_OFLAG_COPIED)) {
-		pr_warning("L2 copy-on-write clusters are not supported");
-		goto error;
-	}
-
-	l2t_offset &= QCOW2_OFFSET_MASK;
-	if (l2t_offset) {
-		/* read and cache l2 table */
+	if (l2t_offset & QCOW2_OFLAG_COPIED) {
+		l2t_offset &= ~QCOW2_OFLAG_COPIED;
 		l2t = qcow_read_l2_table(q, l2t_offset);
 		if (!l2t)
 			goto error;
 	} else {
-		l2t = new_cache_table(q, l2t_offset);
+		l2t_new_offset = qcow_alloc_clusters(q, l2t_size*sizeof(u64));
+		if (l2t_new_offset < 0)
+			goto error;
+
+		l2t = new_cache_table(q, l2t_new_offset);
 		if (!l2t)
 			goto error;
 
-		/* Capture the state of the consistent QCOW image */
-		f_sz = file_size(q->fd);
-		if (!f_sz)
-			goto free_cache;
+		if (l2t_offset)
+			qcow2_read_cluster(q, l2t_offset, l2t->table,
+				l2t_size*sizeof(u64));
+		else
+			memset(l2t->table, 0x00, l2t_size * sizeof(u64));
 
-		/* Write the l2 table of 0's at the end of the file */
-		l2t_offset = qcow_write_l2_table(q, l2t->table);
-		if (!l2t_offset)
+		/*write l2 table*/
+		l2t->dirty = 1;
+		if (qcow_l2_cache_write(q, l2t) < 0)
 			goto free_cache;
 
-		if (cache_table(q, l2t) < 0) {
-			if (ftruncate(q->fd, f_sz) < 0)
-				goto free_cache;
+		/*cache l2 table*/
+		cache_table(q, l2t);
 
-			goto free_cache;
-		}
+		/* Update the l1 table */
+		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset
+			| QCOW2_OFLAG_COPIED);
 
-		/* Update the in-core entry */
-		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_offset);
+		if (pwrite_in_full(q->fd, l1t->l1_table,
+			l1t->table_size * sizeof(u64),
+			header->l1_table_offset) < 0)
+			goto error;
 	}
 
-	/* Capture the state of the consistent QCOW image */
-	f_sz		= file_size(q->fd);
-	if (!f_sz)
-		goto error;
+	*result_l2t = l2t;
+	*result_l2_index = l2t_idx;
 
-	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+	return 0;
 
-	clust_flags = clust_start & QCOW2_OFLAGS_MASK;
-	if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
-		pr_warning("compressed clusters are not supported");
+free_cache:
+	free(l2t);
+error:
+	return -1;
+}
+
+/* If the cluster already has the COPIED flag, write the data in
+ * place. Otherwise allocate a new cluster and write the original
+ * data there with the modification applied (copy-on-write).
+ */
+static ssize_t qcow_write_cluster(struct qcow *q, u64 offset,
+		void *buf, u32 src_len)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l2_table *l2t;
+	u64 clust_start;
+	u64 clust_flags;
+	u64 clust_off;
+	u64 l2t_idx;
+	u64 len;
+
+	l2t = NULL;
+
+	clust_off = get_cluster_offset(q, offset);
+	if (clust_off >= q->cluster_size)
+		return -1;
+
+	len = q->cluster_size - clust_off;
+	if (len > src_len)
+		len = src_len;
+
+	mutex_lock(&q->mutex);
+
+	if (get_cluster_table(q, offset, &l2t, &l2t_idx)) {
+		pr_warning("Get l2 table error");
 		goto error;
 	}
 
-	clust_start &= QCOW2_OFFSET_MASK;
-	if (!clust_start) {
-		clust_start		= ALIGN(f_sz, clust_sz);
-		l2t->table[l2t_idx]	= cpu_to_be64(clust_start | QCOW2_OFLAG_COPIED);
-		l2t->dirty		= 1;
-	}
+	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+	clust_flags = clust_start & QCOW2_OFLAGS_MASK;
 
+	clust_start &= QCOW2_OFFSET_MASK;
 	if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
-		struct qcow_refcount_block *rfb = NULL;
-		u16 clust_refcount;
-		u64 clust_idx;
-		u64 rfb_idx;
+		u64 clust_new_idx;
+		u64 clust_new_start;
 
-		clust_idx = (clust_start & QCOW2_OFFSET_MASK)
-			>> (header->cluster_bits);
-
-		rfb = qcow_read_refcount_block(q, clust_idx);
-		if (!rfb) {
-			pr_warning("L1: error while reading refcount table");
+		clust_new_start	= qcow_alloc_clusters(q, q->cluster_size);
+		if (clust_new_start < 0) {
+			pr_warning("Cluster alloc error!");
 			goto error;
 		}
 
-		rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
-		if (rfb_idx >= rfb->size) {
-			pr_warning("L1: refcount block index out of bounds");
+		clust_new_idx = clust_new_start >> header->cluster_bits;
+		offset &= ~(q->cluster_size - 1);
+
+		/*if clust_start is not zero, read the original data*/
+		if (clust_start) {
+			mutex_unlock(&q->mutex);
+			qcow2_read_cluster(q, offset, q->copy_buff,
+				q->cluster_size);
+			mutex_lock(&q->mutex);
+		} else
+			memset(q->copy_buff, 0x00, q->cluster_size);
+
+		memcpy(q->copy_buff + clust_off, buf, len);
+
+		 /* Write actual data */
+		if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size,
+			clust_new_start) < 0)
 			goto error;
-		}
 
-		clust_refcount = be16_to_cpu(rfb->entries[rfb_idx]);
-		if (!clust_refcount) {
-			clust_refcount = 1;
-			rfb->entries[rfb_idx] = cpu_to_be16(clust_refcount);
-			rfb->dirty = 1;
+		/*update l2 table*/
+		l2t->table[l2t_idx] = cpu_to_be64(clust_new_start
+			| QCOW2_OFLAG_COPIED);
+		l2t->dirty = 1;
+		qcow_l2_cache_write(q, l2t);
+
+		if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
+			clust_start &= q->cluster_offset_mask;
+			clust_start &= ~511;
 		}
 
-		if (clust_refcount > 1) {
-			pr_warning("L1 copy-on-write clusters are not supported");
+		/* Update reference counts: drop the old cluster's
+		 * refcount and take a reference on the new cluster.
+		 */
+		if (clust_start)
+			update_cluster_refcount(q,
+				(clust_start >> header->cluster_bits), -1);
+
+		update_cluster_refcount(q, clust_new_idx, 1);
+	} else {
+		/* Write actual data */
+		if (pwrite_in_full(q->fd, buf, len,
+			clust_start + clust_off) < 0)
 			goto error;
-		}
 	}
-
 	mutex_unlock(&q->mutex);
-
-	/* Write actual data */
-	if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
-		return -1;
-
 	return len;
 
-free_cache:
-	free(l2t);
 error:
 	mutex_unlock(&q->mutex);
 	return -1;
@@ -993,6 +1092,7 @@ static int qcow_disk_close(struct disk_image *disk)
 
 	refcount_table_free_cache(&q->refcount_table);
 	l1_table_free_cache(&q->table);
+	free(q->copy_buff);
 	free(q->cluster_data);
 	free(q->cluster_cache);
 	free(q->refcount_table.rf_table);
@@ -1117,10 +1217,16 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
 	q->cluster_offset_mask = (1LL << q->csize_shift) - 1;
 	q->cluster_size = 1 << q->header->cluster_bits;
 
+	q->copy_buff = malloc(q->cluster_size);
+	if (!q->copy_buff) {
+		pr_warning("copy buff malloc error!");
+		goto free_header;
+	}
+
 	q->cluster_data = malloc(q->cluster_size);
 	if (!q->cluster_data) {
 		pr_warning("cluster data malloc error!");
-		goto free_header;
+		goto free_copy_buff;
 	}
 
 	q->cluster_cache = malloc(q->cluster_size);
@@ -1163,6 +1269,9 @@ free_cluster_cache:
 free_cluster_data:
 	if (q->cluster_data)
 		free(q->cluster_data);
+free_copy_buff:
+	if (q->copy_buff)
+		free(q->copy_buff);
 free_header:
 	if (q->header)
 		free(q->header);
@@ -1252,6 +1361,7 @@ static struct disk_image *qcow1_probe(int fd, bool readonly)
 	q->version = QCOW1_VERSION;
 	q->cluster_size = 1 << q->header->cluster_bits;
 	q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1;
+	q->free_clust_idx = 0;
 
 	q->cluster_data = malloc(q->cluster_size);
 	if (!q->cluster_data) {
diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
index bbf7913..e032a1e 100644
--- a/tools/kvm/include/kvm/qcow.h
+++ b/tools/kvm/include/kvm/qcow.h
@@ -84,8 +84,10 @@ struct qcow {
 	u32				version;
 	u64				cluster_size;
 	u64				cluster_offset_mask;
+	u64				free_clust_idx;
 	void				*cluster_cache;
 	void				*cluster_data;
+	void				*copy_buff;
 };
 
 struct qcow1_header_disk {
-- 
1.7.6.rc2.8.g28eb

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux