[PATCH v5] kvm tools, qcow: Add the support for copy-on-write cluster

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



When meeting request to write the cluster without copied flag,
allocate a new cluster and write original data with modification
to the new cluster. This also adds support for the writing operation
of the qcow2 compressed image. After testing, image file can pass
through "qemu-img check". The performance is needed to be improved.

Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx>
---
 tools/kvm/disk/qcow.c        |  454 ++++++++++++++++++++++++++----------------
 tools/kvm/include/kvm/qcow.h |    2 +
 2 files changed, 289 insertions(+), 167 deletions(-)

diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
index 680b37d..e139fa5 100644
--- a/tools/kvm/disk/qcow.c
+++ b/tools/kvm/disk/qcow.c
@@ -20,6 +20,16 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 
+
+static inline int qcow_pwrite_sync(int fd,
+	void *buf, size_t count, off_t offset)
+{
+	if (pwrite_in_full(fd, buf, count, offset) < 0)
+		return -1;
+
+	return fdatasync(fd);
+}
+
 static int l2_table_insert(struct rb_root *root, struct qcow_l2_table *new)
 {
 	struct rb_node **link = &(root->rb_node), *parent = NULL;
@@ -101,7 +111,8 @@ static int qcow_l2_cache_write(struct qcow *q, struct qcow_l2_table *c)
 
 	size = 1 << header->l2_bits;
 
-	if (pwrite_in_full(q->fd, c->table, size * sizeof(u64), c->offset) < 0)
+	if (qcow_pwrite_sync(q->fd, c->table,
+		size * sizeof(u64), c->offset) < 0)
 		return -1;
 
 	c->dirty = 0;
@@ -122,9 +133,6 @@ static int cache_table(struct qcow *q, struct qcow_l2_table *c)
 		 */
 		lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list);
 
-		if (qcow_l2_cache_write(q, lru) < 0)
-			goto error;
-
 		/* Remove the node from the cache */
 		rb_erase(&lru->node, r);
 		list_del_init(&lru->list);
@@ -508,47 +516,6 @@ static ssize_t qcow_read_sector(struct disk_image *disk, u64 sector,
 	return total;
 }
 
-static inline u64 file_size(int fd)
-{
-	struct stat st;
-
-	if (fstat(fd, &st) < 0)
-		return 0;
-
-	return st.st_size;
-}
-
-static inline int qcow_pwrite_sync(int fd, void *buf, size_t count, off_t offset)
-{
-	if (pwrite_in_full(fd, buf, count, offset) < 0)
-		return -1;
-
-	return fdatasync(fd);
-}
-
-/* Writes a level 2 table at the end of the file. */
-static u64 qcow_write_l2_table(struct qcow *q, u64 *table)
-{
-	struct qcow_header *header = q->header;
-	u64 clust_sz;
-	u64 f_sz;
-	u64 off;
-	u64 sz;
-
-	f_sz		= file_size(q->fd);
-	if (!f_sz)
-		return 0;
-
-	sz		= 1 << header->l2_bits;
-	clust_sz	= 1 << header->cluster_bits;
-	off		= ALIGN(f_sz, clust_sz);
-
-	if (pwrite_in_full(q->fd, table, sz * sizeof(u64), off) < 0)
-		return 0;
-
-	return off;
-}
-
 static void refcount_table_free_cache(struct qcow_refcount_table *rft)
 {
 	struct rb_root *r = &rft->root;
@@ -601,7 +568,8 @@ static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb)
 	if (!rfb->dirty)
 		return 0;
 
-	if (pwrite_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb->offset) < 0)
+	if (qcow_pwrite_sync(q->fd, rfb->entries,
+		rfb->size * sizeof(u16), rfb->offset) < 0)
 		return -1;
 
 	rfb->dirty = 0;
@@ -618,9 +586,6 @@ static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
 	if (rft->nr_cached == MAX_CACHE_NODES) {
 		lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
 
-		if (write_refcount_block(q, lru) < 0)
-			goto error;
-
 		rb_erase(&lru->node, r);
 		list_del_init(&lru->list);
 		rft->nr_cached--;
@@ -706,6 +671,11 @@ static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64
 
 	rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
 
+	if (!rfb_offset) {
+		pr_warning("Don't support to grow refcount table");
+		return NULL;
+	}
+
 	rfb = refcount_block_search(q, rfb_offset);
 	if (rfb)
 		return rfb;
@@ -728,35 +698,140 @@ error_free_rfb:
 	return NULL;
 }
 
+static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx)
+{
+	struct qcow_refcount_block *rfb = NULL;
+	struct qcow_header *header = q->header;
+	u64 rfb_idx;
+
+	rfb = qcow_read_refcount_block(q, clust_idx);
+	if (!rfb) {
+		pr_warning("Error while reading refcount table");
+		return -1;
+	}
+
+	rfb_idx = clust_idx & (((1ULL <<
+		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+
+	if (rfb_idx >= rfb->size) {
+		pr_warning("L1: refcount block index out of bounds");
+		return -1;
+	}
+
+	return be16_to_cpu(rfb->entries[rfb_idx]);
+}
+
+static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append)
+{
+	struct qcow_refcount_block *rfb = NULL;
+	struct qcow_header *header = q->header;
+	u16 refcount;
+	u64 rfb_idx;
+
+	rfb = qcow_read_refcount_block(q, clust_idx);
+	if (!rfb) {
+		pr_warning("error while reading refcount table");
+		return -1;
+	}
+
+	rfb_idx = clust_idx & (((1ULL <<
+		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+	if (rfb_idx >= rfb->size) {
+		pr_warning("refcount block index out of bounds");
+		return -1;
+	}
+
+	refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append;
+	rfb->entries[rfb_idx] = cpu_to_be16(refcount);
+	rfb->dirty = 1;
+
+	/* write refcount block */
+	if (write_refcount_block(q, rfb) < 0) {
+		pr_warning("refcount block index out of bounds");
+		return -1;
+	}
+
+	/* update free_clust_idx since refcount becomes zero */
+	if (!refcount && clust_idx < q->free_clust_idx)
+		q->free_clust_idx = clust_idx;
+
+	return 0;
+}
+
+static void  qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size)
+{
+	struct qcow_header *header = q->header;
+	u64 start, end, offset;
+
+	start = clust_start & ~(q->cluster_size - 1);
+	end = (clust_start + size - 1) & ~(q->cluster_size - 1);
+	for (offset = start; offset <= end; offset += q->cluster_size)
+		update_cluster_refcount(q, offset >> header->cluster_bits, -1);
+}
+
 /*
- * QCOW file might grow during a write operation. Not only data but metadata is
- * also written at the end of the file. Therefore it is necessary to ensure
- * every write is committed to disk. Hence we use uses qcow_pwrite_sync() to
- * synchronize the in-core state of QCOW image to disk.
- *
- * We also try to restore the image to a consistent state if the metdata
- * operation fails. The two metadat operations are: level 1 and level 2 table
- * update. If either of them fails the image is truncated to a consistent state.
+ * Allocate clusters according to the size. Find a postion that
+ * can satisfy the size. free_clust_idx is initialized to zero and
+ * Record last position.
  */
-static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
+static u64 qcow_alloc_clusters(struct qcow *q, u64 size)
+{
+	struct qcow_header *header = q->header;
+	u16 clust_refcount;
+	u32 clust_idx, i;
+	u64 clust_num;
+
+	clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;
+
+again:
+	for (i = 0; i < clust_num; i++) {
+		clust_idx = q->free_clust_idx++;
+		clust_refcount = qcow_get_refcount(q, clust_idx);
+		if (clust_refcount < 0)
+			return -1;
+		else if (clust_refcount > 0)
+			goto again;
+	}
+
+	for (i = 0; i < clust_num; i++)
+		if (update_cluster_refcount(q,
+			q->free_clust_idx - clust_num + i, 1))
+			return -1;
+
+	return (q->free_clust_idx - clust_num) << header->cluster_bits;
+}
+
+static int qcow_write_l1_table(struct qcow *q)
+{
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_header *header = q->header;
+
+	if (qcow_pwrite_sync(q->fd, l1t->l1_table,
+		l1t->table_size * sizeof(u64),
+		header->l1_table_offset) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Get l2 table. If the table has been copied, read table directly.
+ * If the table exists, allocate a new cluster and copy the table
+ * to the new cluster.
+ */
+static int get_cluster_table(struct qcow *q, u64 offset,
+	struct qcow_l2_table **result_l2t, u64 *result_l2_index)
 {
 	struct qcow_header *header = q->header;
 	struct qcow_l1_table *l1t = &q->table;
 	struct qcow_l2_table *l2t;
-	u64 clust_start;
-	u64 clust_flags;
-	u64 l2t_offset;
-	u64 clust_off;
-	u64 l2t_size;
-	u64 clust_sz;
 	u64 l1t_idx;
+	u64 l2t_offset;
 	u64 l2t_idx;
-	u64 f_sz;
-	u64 len;
+	u64 l2t_size;
+	u64 l2t_new_offset;
 
-	l2t		= NULL;
-	l2t_size	= 1 << header->l2_bits;
-	clust_sz	= 1 << header->cluster_bits;
+	l2t_size = 1 << header->l2_bits;
 
 	l1t_idx = get_l1_index(q, offset);
 	if (l1t_idx >= l1t->table_size)
@@ -766,122 +841,164 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
 	if (l2t_idx >= l2t_size)
 		return -1;
 
-	clust_off = get_cluster_offset(q, offset);
-	if (clust_off >= clust_sz)
-		return -1;
-
-	len = clust_sz - clust_off;
-	if (len > src_len)
-		len = src_len;
-
-	mutex_lock(&q->mutex);
-
 	l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]);
-	if (l2t_offset & QCOW2_OFLAG_COMPRESSED) {
-		pr_warning("compressed clusters are not supported");
-		goto error;
-	}
-	if (!(l2t_offset & QCOW2_OFLAG_COPIED)) {
-		pr_warning("L2 copy-on-write clusters are not supported");
-		goto error;
-	}
-
-	l2t_offset &= QCOW2_OFFSET_MASK;
-	if (l2t_offset) {
-		/* read and cache l2 table */
+	if (l2t_offset & QCOW2_OFLAG_COPIED) {
+		l2t_offset &= ~QCOW2_OFLAG_COPIED;
 		l2t = qcow_read_l2_table(q, l2t_offset);
 		if (!l2t)
 			goto error;
 	} else {
-		l2t = new_cache_table(q, l2t_offset);
-		if (!l2t)
+		l2t_new_offset = qcow_alloc_clusters(q, l2t_size*sizeof(u64));
+		if (l2t_new_offset < 0)
 			goto error;
 
-		/* Capture the state of the consistent QCOW image */
-		f_sz = file_size(q->fd);
-		if (!f_sz)
-			goto free_cache;
+		l2t = new_cache_table(q, l2t_new_offset);
+		if (!l2t)
+			goto free_cluster;
+
+		if (l2t_offset) {
+			l2t = qcow_read_l2_table(q, l2t_offset);
+			if (!l2t)
+				goto free_cache;
+		} else
+			memset(l2t->table, 0x00, l2t_size * sizeof(u64));
 
-		/* Write the l2 table of 0's at the end of the file */
-		l2t_offset = qcow_write_l2_table(q, l2t->table);
-		if (!l2t_offset)
+		/* write l2 table */
+		l2t->dirty = 1;
+		if (qcow_l2_cache_write(q, l2t) < 0)
 			goto free_cache;
 
-		if (cache_table(q, l2t) < 0) {
-			if (ftruncate(q->fd, f_sz) < 0)
-				goto free_cache;
+		/* cache l2 table */
+		if (cache_table(q, l2t))
+			goto free_cache;
 
+		/* update the l1 talble */
+		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset
+			| QCOW2_OFLAG_COPIED);
+		if (qcow_write_l1_table(q)) {
+			pr_warning("Update l1 table error");
 			goto free_cache;
 		}
 
-		/* Update the in-core entry */
-		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_offset);
+		/* free old cluster */
+		qcow_free_clusters(q, l2t_offset, q->cluster_size);
 	}
 
-	/* Capture the state of the consistent QCOW image */
-	f_sz		= file_size(q->fd);
-	if (!f_sz)
-		goto error;
+	*result_l2t = l2t;
+	*result_l2_index = l2t_idx;
 
-	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+	return 0;
 
-	clust_flags = clust_start & QCOW2_OFLAGS_MASK;
-	if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
-		pr_warning("compressed clusters are not supported");
-		goto error;
-	}
+free_cache:
+	free(l2t);
 
-	clust_start &= QCOW2_OFFSET_MASK;
-	if (!clust_start) {
-		clust_start		= ALIGN(f_sz, clust_sz);
-		l2t->table[l2t_idx]	= cpu_to_be64(clust_start | QCOW2_OFLAG_COPIED);
-		l2t->dirty		= 1;
-	}
+free_cluster:
+	qcow_free_clusters(q, l2t_new_offset, q->cluster_size);
 
-	if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
-		struct qcow_refcount_block *rfb = NULL;
-		u16 clust_refcount;
-		u64 clust_idx;
-		u64 rfb_idx;
+error:
+	return -1;
+}
 
-		clust_idx = (clust_start & QCOW2_OFFSET_MASK)
-			>> (header->cluster_bits);
+/*
+ * If the cluster has been copied, write data directly. If not,
+ * read the original data and write it to the new cluster with
+ * modification.
+ */
+static ssize_t qcow_write_cluster(struct qcow *q, u64 offset,
+		void *buf, u32 src_len)
+{
+	struct qcow_l2_table *l2t;
+	u64 clust_new_start;
+	u64 clust_start;
+	u64 clust_flags;
+	u64 clust_off;
+	u64 l2t_idx;
+	u64 len;
 
-		rfb = qcow_read_refcount_block(q, clust_idx);
-		if (!rfb) {
-			pr_warning("L1: error while reading refcount table");
-			goto error;
-		}
+	l2t = NULL;
 
-		rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
-		if (rfb_idx >= rfb->size) {
-			pr_warning("L1: refcount block index out of bounds");
+	clust_off = get_cluster_offset(q, offset);
+	if (clust_off >= q->cluster_size)
+		return -1;
+
+	len = q->cluster_size - clust_off;
+	if (len > src_len)
+		len = src_len;
+
+	mutex_lock(&q->mutex);
+
+	if (get_cluster_table(q, offset, &l2t, &l2t_idx)) {
+		pr_warning("Get l2 table error");
+		goto error;
+	}
+
+	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+	clust_flags = clust_start & QCOW2_OFLAGS_MASK;
+
+	clust_start &= QCOW2_OFFSET_MASK;
+	if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
+		clust_new_start	= qcow_alloc_clusters(q, q->cluster_size);
+		if (clust_new_start < 0) {
+			pr_warning("Cluster alloc error");
 			goto error;
 		}
 
-		clust_refcount = be16_to_cpu(rfb->entries[rfb_idx]);
-		if (!clust_refcount) {
-			clust_refcount = 1;
-			rfb->entries[rfb_idx] = cpu_to_be16(clust_refcount);
-			rfb->dirty = 1;
-		}
+		offset &= ~(q->cluster_size - 1);
+
+		/* if clust_start is not zero, read the original data*/
+		if (clust_start) {
+			mutex_unlock(&q->mutex);
+			if (qcow2_read_cluster(q, offset, q->copy_buff,
+				q->cluster_size) < 0) {
+				pr_warning("Read copy cluster error");
+				qcow_free_clusters(q, clust_new_start,
+					q->cluster_size);
+				return -1;
+			}
+			mutex_lock(&q->mutex);
+		} else
+			memset(q->copy_buff, 0x00, q->cluster_size);
+
+		memcpy(q->copy_buff + clust_off, buf, len);
+
+		 /* Write actual data */
+		if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size,
+			clust_new_start) < 0)
+			goto free_cluster;
+
+		/* update l2 table*/
+		l2t->table[l2t_idx] = cpu_to_be64(clust_new_start
+			| QCOW2_OFLAG_COPIED);
+		l2t->dirty = 1;
+
+		if (qcow_l2_cache_write(q, l2t))
+			goto free_cluster;
+
+		/* free old cluster*/
+		if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
+			int size;
+			size = ((clust_start >> q->csize_shift) &
+				q->csize_mask) + 1;
+			size *= 512;
+			clust_start &= q->cluster_offset_mask;
+			clust_start &= ~511;
+
+			qcow_free_clusters(q, clust_start, size);
+		} else if (clust_start)
+			qcow_free_clusters(q, clust_start, q->cluster_size);
 
-		if (clust_refcount > 1) {
-			pr_warning("L1 copy-on-write clusters are not supported");
+	} else {
+		/* Write actual data */
+		if (pwrite_in_full(q->fd, buf, len,
+			clust_start + clust_off) < 0)
 			goto error;
-		}
 	}
-
 	mutex_unlock(&q->mutex);
-
-	/* Write actual data */
-	if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
-		return -1;
-
 	return len;
 
-free_cache:
-	free(l2t);
+free_cluster:
+	qcow_free_clusters(q, clust_new_start, q->cluster_size);
+
 error:
 	mutex_unlock(&q->mutex);
 	return -1;
@@ -940,11 +1057,9 @@ static int qcow_disk_flush(struct disk_image *disk)
 {
 	struct qcow *q = disk->priv;
 	struct qcow_refcount_table *rft;
-	struct qcow_header *header;
 	struct list_head *pos, *n;
 	struct qcow_l1_table *l1t;
 
-	header = q->header;
 	l1t = &q->table;
 	rft = &q->refcount_table;
 
@@ -957,9 +1072,6 @@ static int qcow_disk_flush(struct disk_image *disk)
 			goto error_unlock;
 	}
 
-	if (fdatasync(disk->fd) < 0)
-		goto error_unlock;
-
 	list_for_each_safe(pos, n, &l1t->lru_list) {
 		struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list);
 
@@ -967,10 +1079,7 @@ static int qcow_disk_flush(struct disk_image *disk)
 			goto error_unlock;
 	}
 
-	if (fdatasync(disk->fd) < 0)
-		goto error_unlock;
-
-	if (pwrite_in_full(disk->fd, l1t->l1_table, l1t->table_size * sizeof(u64), header->l1_table_offset) < 0)
+	if (qcow_write_l1_table < 0)
 		goto error_unlock;
 
 	mutex_unlock(&q->mutex);
@@ -993,6 +1102,7 @@ static int qcow_disk_close(struct disk_image *disk)
 
 	refcount_table_free_cache(&q->refcount_table);
 	l1_table_free_cache(&q->table);
+	free(q->copy_buff);
 	free(q->cluster_data);
 	free(q->cluster_cache);
 	free(q->refcount_table.rf_table);
@@ -1117,15 +1227,21 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
 	q->cluster_offset_mask = (1LL << q->csize_shift) - 1;
 	q->cluster_size = 1 << q->header->cluster_bits;
 
+	q->copy_buff = malloc(q->cluster_size);
+	if (!q->copy_buff) {
+		pr_warning("copy buff malloc error");
+		goto free_header;
+	}
+
 	q->cluster_data = malloc(q->cluster_size);
 	if (!q->cluster_data) {
-		pr_warning("cluster data malloc error!");
-		goto free_header;
+		pr_warning("cluster data malloc error");
+		goto free_copy_buff;
 	}
 
 	q->cluster_cache = malloc(q->cluster_size);
 	if (!q->cluster_cache) {
-		pr_warning("cluster cache malloc error!");
+		pr_warning("cluster cache malloc error");
 		goto free_cluster_data;
 	}
 
@@ -1163,6 +1279,9 @@ free_cluster_cache:
 free_cluster_data:
 	if (q->cluster_data)
 		free(q->cluster_data);
+free_copy_buff:
+	if (q->copy_buff)
+		free(q->copy_buff);
 free_header:
 	if (q->header)
 		free(q->header);
@@ -1252,16 +1371,17 @@ static struct disk_image *qcow1_probe(int fd, bool readonly)
 	q->version = QCOW1_VERSION;
 	q->cluster_size = 1 << q->header->cluster_bits;
 	q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1;
+	q->free_clust_idx = 0;
 
 	q->cluster_data = malloc(q->cluster_size);
 	if (!q->cluster_data) {
-		pr_warning("cluster data malloc error!");
+		pr_warning("cluster data malloc error");
 		goto free_header;
 	}
 
 	q->cluster_cache = malloc(q->cluster_size);
 	if (!q->cluster_cache) {
-		pr_warning("cluster cache malloc error!");
+		pr_warning("cluster cache malloc error");
 		goto free_cluster_data;
 	}
 
diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
index bbf7913..e032a1e 100644
--- a/tools/kvm/include/kvm/qcow.h
+++ b/tools/kvm/include/kvm/qcow.h
@@ -84,8 +84,10 @@ struct qcow {
 	u32				version;
 	u64				cluster_size;
 	u64				cluster_offset_mask;
+	u64				free_clust_idx;
 	void				*cluster_cache;
 	void				*cluster_data;
+	void				*copy_buff;
 };
 
 struct qcow1_header_disk {
-- 
1.7.6.rc2.8.g28eb

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux