On 18.11.2011 09:47, Lan Tianyu wrote:
> When meeting request to write the cluster without copied flag,
> allocate a new cluster and write original data with modification
> to the new cluster. This also adds support for the writing operation
> of the qcow2 compressed image. After testing, image file can pass
> through "qemu-img check".
>
> Please enter the commit message for your changes. Lines starting
>
> Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx>
> ---
>  tools/kvm/disk/qcow.c        |  366 +++++++++++++++++++++++++++++-------------
>  tools/kvm/include/kvm/qcow.h |    2 +
>  2 files changed, 255 insertions(+), 113 deletions(-)
>
> diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
> index 680b37d..4d9125d 100644
> --- a/tools/kvm/disk/qcow.c
> +++ b/tools/kvm/disk/qcow.c
> @@ -122,9 +122,6 @@ static int cache_table(struct qcow *q, struct qcow_l2_table *c)
>  	 */
>  	lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list);
>
> -	if (qcow_l2_cache_write(q, lru) < 0)
> -		goto error;
> -
>  	/* Remove the node from the cache */
>  	rb_erase(&lru->node, r);
>  	list_del_init(&lru->list);
> @@ -618,9 +615,6 @@ static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
>  	if (rft->nr_cached == MAX_CACHE_NODES) {
>  		lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
>
> -		if (write_refcount_block(q, lru) < 0)
> -			goto error;
> -
>  		rb_erase(&lru->node, r);
>  		list_del_init(&lru->list);
>  		rft->nr_cached--;
> @@ -706,6 +700,11 @@ static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64
>
>  	rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
>
> +	if (!rfb_offset) {
> +		pr_warning("Don't support to grow refcount table");
> +		return NULL;
> +	}
> +
>  	rfb = refcount_block_search(q, rfb_offset);
>  	if (rfb)
>  		return rfb;
> @@ -728,35 +727,121 @@ error_free_rfb:
>  	return NULL;
>  }
>
> -/*
> - * QCOW file might grow during a write operation. Not only data but metadata is
> - * also written at the end of the file. Therefore it is necessary to ensure
> - * every write is committed to disk. Hence we use uses qcow_pwrite_sync() to
> - * synchronize the in-core state of QCOW image to disk.
> - *
> - * We also try to restore the image to a consistent state if the metdata
> - * operation fails. The two metadat operations are: level 1 and level 2 table
> - * update. If either of them fails the image is truncated to a consistent state.
> +static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx)
> +{
> +	struct qcow_refcount_block *rfb = NULL;
> +	struct qcow_header *header = q->header;
> +	u64 rfb_idx;
> +
> +	rfb = qcow_read_refcount_block(q, clust_idx);
> +	if (!rfb) {
> +		pr_warning("Error while reading refcount table");
> +		return -1;
> +	}
> +
> +	rfb_idx = clust_idx & (((1ULL <<
> +		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
> +
> +	if (rfb_idx >= rfb->size) {
> +		pr_warning("L1: refcount block index out of bounds");
> +		return -1;
> +	}
> +
> +	return be16_to_cpu(rfb->entries[rfb_idx]);
> +}
> +
> +static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append)
> +{
> +	struct qcow_refcount_block *rfb = NULL;
> +	struct qcow_header *header = q->header;
> +	u16 refcount;
> +	u64 rfb_idx;
> +
> +	rfb = qcow_read_refcount_block(q, clust_idx);
> +	if (!rfb) {
> +		pr_warning("error while reading refcount table");
> +		return -1;
> +	}
> +
> +	rfb_idx = clust_idx & (((1ULL <<
> +		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
> +	if (rfb_idx >= rfb->size) {
> +		pr_warning("refcount block index out of bounds");
> +		return -1;
> +	}
> +
> +	refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append;
> +	rfb->entries[rfb_idx] = cpu_to_be16(refcount);
> +	rfb->dirty = 1;
> +
> +	/*write refcount block*/
> +	write_refcount_block(q, rfb);

Missing error handling.

> +
> +	/*update free_clust_idx since refcount becomes zero*/
> +	if (!refcount && clust_idx < q->free_clust_idx)
> +		q->free_clust_idx = clust_idx;
> +
> +	return 0;
> +}
> +
> +static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size)
> +{
> +	struct qcow_header *header = q->header;
> +	u64 start, end, offset;
> +
> +	start = clust_start & ~(q->cluster_size - 1);
> +	end = (clust_start + size - 1) & ~(q->cluster_size - 1);
> +	for (offset = start; offset <= end; offset += q->cluster_size)
> +		update_cluster_refcount(q, offset >> header->cluster_bits, -1);
> +}
> +
> +/*Allocate clusters according to the size. Find a postion that
> + *can satisfy the size. free_clust_idx is initialized to zero and
> + *Record last position.
> +*/
> +static u64 qcow_alloc_clusters(struct qcow *q, u64 size)
> +{
> +	struct qcow_header *header = q->header;
> +	u16 clust_refcount;
> +	u32 clust_idx, i;
> +	u64 clust_num;
> +
> +	clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;
> +
> +again:
> +	for (i = 0; i < clust_num; i++) {
> +		clust_idx = q->free_clust_idx++;
> +		clust_refcount = qcow_get_refcount(q, clust_idx);
> +		if (clust_refcount < 0)
> +			return -1;
> +		else if (clust_refcount > 0)
> +			goto again;
> +	}
> +
> +	for (i = 0; i < clust_num; i++)
> +		update_cluster_refcount(q,
> +			q->free_clust_idx - clust_num + i, 1);

Error handling again.
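Just to illustrate what I mean, an untested sketch of both fixes. It
only uses functions that already exist in this patch, and it keeps the
patch's convention of returning -1 from a u64-returning function
(itself dubious, but a separate issue):

	/* in update_cluster_refcount(): don't ignore the write result */
	if (write_refcount_block(q, rfb) < 0) {
		pr_warning("refcount block write error");
		return -1;
	}

	/* in qcow_alloc_clusters(): roll back partial refcount bumps
	 * instead of leaving some clusters half-allocated */
	for (i = 0; i < clust_num; i++) {
		if (update_cluster_refcount(q,
				q->free_clust_idx - clust_num + i, 1) < 0) {
			while (i--)
				update_cluster_refcount(q,
					q->free_clust_idx - clust_num + i, -1);
			return -1;
		}
	}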
> +
> +	return (q->free_clust_idx - clust_num) << header->cluster_bits;
> +}
> +
> +/*Get l2 table. If the table has been copied, read table directly.
> + *If the table exists, allocate a new cluster and copy the table
> + *to the new cluster.
>  */
> -static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
> +static int get_cluster_table(struct qcow *q, u64 offset,
> +	struct qcow_l2_table **result_l2t, u64 *result_l2_index)
>  {
>  	struct qcow_header *header = q->header;
>  	struct qcow_l1_table *l1t = &q->table;
>  	struct qcow_l2_table *l2t;
> -	u64 clust_start;
> -	u64 clust_flags;
> -	u64 l2t_offset;
> -	u64 clust_off;
> -	u64 l2t_size;
> -	u64 clust_sz;
>  	u64 l1t_idx;
> +	u64 l2t_offset;
>  	u64 l2t_idx;
> -	u64 f_sz;
> -	u64 len;
> +	u64 l2t_size;
> +	u64 l2t_new_offset;
>
> -	l2t = NULL;
> -	l2t_size = 1 << header->l2_bits;
> -	clust_sz = 1 << header->cluster_bits;
> +	l2t_size = 1 << header->l2_bits;
>
>  	l1t_idx = get_l1_index(q, offset);
>  	if (l1t_idx >= l1t->table_size)
> @@ -766,122 +851,166 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
>  	if (l2t_idx >= l2t_size)
>  		return -1;
>
> -	clust_off = get_cluster_offset(q, offset);
> -	if (clust_off >= clust_sz)
> -		return -1;
> -
> -	len = clust_sz - clust_off;
> -	if (len > src_len)
> -		len = src_len;
> -
> -	mutex_lock(&q->mutex);
> -
>  	l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]);
> -	if (l2t_offset & QCOW2_OFLAG_COMPRESSED) {
> -		pr_warning("compressed clusters are not supported");
> -		goto error;
> -	}
> -	if (!(l2t_offset & QCOW2_OFLAG_COPIED)) {
> -		pr_warning("L2 copy-on-write clusters are not supported");
> -		goto error;
> -	}
> -
> -	l2t_offset &= QCOW2_OFFSET_MASK;
> -	if (l2t_offset) {
> -		/* read and cache l2 table */
> +	if (l2t_offset & QCOW2_OFLAG_COPIED) {
> +		l2t_offset &= ~QCOW2_OFLAG_COPIED;
>  		l2t = qcow_read_l2_table(q, l2t_offset);
>  		if (!l2t)
>  			goto error;
>  	} else {
> -		l2t = new_cache_table(q, l2t_offset);
> -		if (!l2t)
> +		l2t_new_offset = qcow_alloc_clusters(q, l2t_size*sizeof(u64));
> +		if (l2t_new_offset < 0)
>  			goto error;
>
> -		/* Capture the state of the consistent QCOW image */
> -		f_sz = file_size(q->fd);
> -		if (!f_sz)
> -			goto free_cache;
> +		l2t = new_cache_table(q, l2t_new_offset);
> +		if (!l2t)
> +			goto free_cluster;
> +
> +		if (l2t_offset)
> +			qcow2_read_cluster(q, l2t_offset, l2t->table,
> +				l2t_size*sizeof(u64));

Another ignored return value; there must be a system behind it. :-)

> +		else
> +			memset(l2t->table, 0x00, l2t_size * sizeof(u64));
>
> -		/* Write the l2 table of 0's at the end of the file */
> -		l2t_offset = qcow_write_l2_table(q, l2t->table);
> -		if (!l2t_offset)
> +		/*write l2 table*/
> +		l2t->dirty = 1;
> +		if (qcow_l2_cache_write(q, l2t) < 0)
>  			goto free_cache;

You need to make sure that the refcount update is written first (e.g.
with fsync), otherwise you risk corruption when the host crashes in
the middle.

>
> -		if (cache_table(q, l2t) < 0) {
> -			if (ftruncate(q->fd, f_sz) < 0)
> -				goto free_cache;
> +		/* Update the l1 talble */
> +		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset
> +			| QCOW2_OFLAG_COPIED);
>
> -			goto free_cache;
> -		}
> +		if (pwrite_in_full(q->fd, l1t->l1_table,
> +			l1t->table_size * sizeof(u64),
> +			header->l1_table_offset) < 0)
> +			goto error;

Likewise, the L1 table update must be ordered against the L2 table
write. Also, goto error is using the wrong label here; presumably it
should be free_cache, so that the new cluster and the cache entry
don't leak.

>
> -		/* Update the in-core entry */
> -		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_offset);
> +		/*cache l2 table*/
> +		cache_table(q, l2t);

After so many explicit comments, you can probably guess what's wrong
here... (hint: yet another unchecked return value)
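Schematically, the crash-safe order for this branch would look
something like the sketch below. It is untested; fdatasync() is just
one way to enforce the barriers (the code this patch removes used
qcow_pwrite_sync() for the same purpose), and it glosses over the fact
that unwinding becomes much harder once the L1 entry has hit the disk:

	/* 1. qcow_alloc_clusters() already bumped the refcounts for the
	 *    new L2 cluster; flush them so that a crash can at worst
	 *    leak a cluster, never hand it out twice */
	if (fdatasync(q->fd) < 0)
		goto free_cluster;

	/* 2. write the new L2 table and flush it before anything
	 *    points to it */
	l2t->dirty = 1;
	if (qcow_l2_cache_write(q, l2t) < 0)
		goto free_cache;
	if (fdatasync(q->fd) < 0)
		goto free_cache;

	/* 3. only now publish the new table in the L1 table; on
	 *    failure, unwind through free_cache, not error */
	l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset
		| QCOW2_OFLAG_COPIED);
	if (pwrite_in_full(q->fd, l1t->l1_table,
			l1t->table_size * sizeof(u64),
			header->l1_table_offset) < 0)
		goto free_cache;

	/* 4. and check the result of cache_table(), too */
	if (cache_table(q, l2t) < 0)
		goto free_cache;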
> +
> +		/*free old cluster*/
> +		qcow_free_clusters(q, l2t_offset, q->cluster_size);
>  	}
>
> -	/* Capture the state of the consistent QCOW image */
> -	f_sz = file_size(q->fd);
> -	if (!f_sz)
> -		goto error;
> +	*result_l2t = l2t;
> +	*result_l2_index = l2t_idx;
>
> -	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
> +	return 0;
>
> -	clust_flags = clust_start & QCOW2_OFLAGS_MASK;
> -	if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
> -		pr_warning("compressed clusters are not supported");
> +free_cache:
> +	free(l2t);
> +
> +free_cluster:
> +	qcow_free_clusters(q, l2t_new_offset, q->cluster_size);
> +
> +error:
> +	return -1;
> +}
> +
> +/*If the cluster has been copied, write data directly. If not,
> + *read the original data and write it to the new cluster with
> + *modification.
> +*/
> +static ssize_t qcow_write_cluster(struct qcow *q, u64 offset,
> +	void *buf, u32 src_len)
> +{
> +	struct qcow_header *header = q->header;
> +	struct qcow_l2_table *l2t;
> +	u64 clust_new_idx;
> +	u64 clust_new_start;
> +	u64 clust_start;
> +	u64 clust_flags;
> +	u64 clust_off;
> +	u64 l2t_idx;
> +	u64 len;
> +
> +	l2t = NULL;
> +
> +	clust_off = get_cluster_offset(q, offset);
> +	if (clust_off >= q->cluster_size)
> +		return -1;
> +
> +	len = q->cluster_size - clust_off;
> +	if (len > src_len)
> +		len = src_len;
> +
> +	mutex_lock(&q->mutex);
> +
> +	if (get_cluster_table(q, offset, &l2t, &l2t_idx)) {
> +		pr_warning("Get l2 table error");
>  		goto error;
>  	}
>
> -	clust_start &= QCOW2_OFFSET_MASK;
> -	if (!clust_start) {
> -		clust_start = ALIGN(f_sz, clust_sz);
> -		l2t->table[l2t_idx] = cpu_to_be64(clust_start | QCOW2_OFLAG_COPIED);
> -		l2t->dirty = 1;
> -	}
> +	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
> +	clust_flags = clust_start & QCOW2_OFLAGS_MASK;
>
> +	clust_start &= QCOW2_OFFSET_MASK;
>  	if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
> -		struct qcow_refcount_block *rfb = NULL;
> -		u16 clust_refcount;
> -		u64 clust_idx;
> -		u64 rfb_idx;
> -
> -		clust_idx = (clust_start & QCOW2_OFFSET_MASK)
> -			>> (header->cluster_bits);
>
> -		rfb = qcow_read_refcount_block(q, clust_idx);
> -		if (!rfb) {
> -			pr_warning("L1: error while reading refcount table");
> +		clust_new_start = qcow_alloc_clusters(q, q->cluster_size);
> +		if (clust_new_start < 0) {
> +			pr_warning("Cluster alloc error!");
>  			goto error;
>  		}
>
> -		rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
> -		if (rfb_idx >= rfb->size) {
> -			pr_warning("L1: refcount block index out of bounds");
> -			goto error;
> -		}
> +		clust_new_idx = clust_new_start >> header->cluster_bits;
> +		offset &= ~(q->cluster_size - 1);
> +
> +		/*if clust_start is not zero, read the original data*/
> +		if (clust_start) {
> +			mutex_unlock(&q->mutex);
> +			if (qcow2_read_cluster(q, offset, q->copy_buff,
> +				q->cluster_size) < 0) {
> +				pr_warning("Read copy cluster error");
> +				qcow_free_clusters(q, clust_new_start,
> +					q->cluster_size);
> +				return -1;
> +			}
> +			mutex_lock(&q->mutex);
> +		} else
> +			memset(q->copy_buff, 0x00, q->cluster_size);
> +
> +		memcpy(q->copy_buff + clust_off, buf, len);
> +
> +		/* Write actual data */
> +		if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size,
> +			clust_new_start) < 0)
> +			goto free_cluster;
> +
> +		/*update l2 table*/
> +		l2t->table[l2t_idx] = cpu_to_be64(clust_new_start
> +			| QCOW2_OFLAG_COPIED);
> +		l2t->dirty = 1;
> +
> +		if (qcow_l2_cache_write(q, l2t))
> +			goto free_cluster;

Cluster allocation must be ordered against the L2 update: the data and
the refcount bump must be safely on disk before the L2 entry points to
the new cluster.
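Same idea as the sketch further up, and with the same fdatasync()
caveat, i.e. it is only meant to show the required ordering:

	/* the guest data, and the refcount bump already done by
	 * qcow_alloc_clusters(), must be stable on disk before the L2
	 * entry makes the new cluster reachable */
	if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size,
			clust_new_start) < 0)
		goto free_cluster;

	if (fdatasync(q->fd) < 0)
		goto free_cluster;

	/* only then update the L2 entry */
	l2t->table[l2t_idx] = cpu_to_be64(clust_new_start
		| QCOW2_OFLAG_COPIED);
	l2t->dirty = 1;
	if (qcow_l2_cache_write(q, l2t) < 0)
		goto free_cluster;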
> +
> +		/*free old cluster*/
> +		if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
> +			int size;
> +			size = ((clust_start >> q->csize_shift) &
> +				q->csize_mask) + 1;
> +			size *= 512;
> +			clust_start &= q->cluster_offset_mask;
> +			clust_start &= ~511;
> +
> +			qcow_free_clusters(q, clust_start, size);
> +		} else if (clust_start)
> +			qcow_free_clusters(q, clust_start, q->cluster_size);
>
> -		clust_refcount = be16_to_cpu(rfb->entries[rfb_idx]);
> -		if (!clust_refcount) {
> -			clust_refcount = 1;
> -			rfb->entries[rfb_idx] = cpu_to_be16(clust_refcount);
> -			rfb->dirty = 1;
> -		}
> -
> -		if (clust_refcount > 1) {
> -			pr_warning("L1 copy-on-write clusters are not supported");
> +	} else {
> +		/* Write actual data */
> +		if (pwrite_in_full(q->fd, buf, len,
> +			clust_start + clust_off) < 0)
>  			goto error;
> -		}
>  	}
> -
>  	mutex_unlock(&q->mutex);
> -
> -	/* Write actual data */
> -	if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
> -		return -1;
> -
>  	return len;
>
> -free_cache:
> -	free(l2t);
> +free_cluster:
> +	qcow_free_clusters(q, clust_new_start, q->cluster_size);
> +
>  error:
>  	mutex_unlock(&q->mutex);
>  	return -1;
> @@ -993,6 +1122,7 @@ static int qcow_disk_close(struct disk_image *disk)
>
>  	refcount_table_free_cache(&q->refcount_table);
>  	l1_table_free_cache(&q->table);
> +	free(q->copy_buff);
>  	free(q->cluster_data);
>  	free(q->cluster_cache);
>  	free(q->refcount_table.rf_table);
> @@ -1117,10 +1247,16 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
>  	q->cluster_offset_mask = (1LL << q->csize_shift) - 1;
>  	q->cluster_size = 1 << q->header->cluster_bits;
>
> +	q->copy_buff = malloc(q->cluster_size);
> +	if (!q->copy_buff) {
> +		pr_warning("copy buff malloc error!");
> +		goto free_header;
> +	}
> +
>  	q->cluster_data = malloc(q->cluster_size);
>  	if (!q->cluster_data) {
>  		pr_warning("cluster data malloc error!");
> -		goto free_header;
> +		goto free_copy_buff;
>  	}
>
>  	q->cluster_cache = malloc(q->cluster_size);
> @@ -1163,6 +1299,9 @@ free_cluster_cache:
>  free_cluster_data:
>  	if (q->cluster_data)
>  		free(q->cluster_data);
> +free_copy_buff:
> +	if (q->cluster_data)
> +		free(q->cluster_data);

This looks like the wrong field: the label is free_copy_buff, but it
tests and frees q->cluster_data a second time instead of q->copy_buff.

>  free_header:
>  	if (q->header)
>  		free(q->header);
> @@ -1252,6 +1391,7 @@ static struct disk_image *qcow1_probe(int fd, bool readonly)
>  	q->version = QCOW1_VERSION;
>  	q->cluster_size = 1 << q->header->cluster_bits;
>  	q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1;
> +	q->free_clust_idx = 0;
>
>  	q->cluster_data = malloc(q->cluster_size);
>  	if (!q->cluster_data) {
> diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
> index bbf7913..e032a1e 100644
> --- a/tools/kvm/include/kvm/qcow.h
> +++ b/tools/kvm/include/kvm/qcow.h
> @@ -84,8 +84,10 @@ struct qcow {
>  	u32 version;
>  	u64 cluster_size;
>  	u64 cluster_offset_mask;
> +	u64 free_clust_idx;
>  	void *cluster_cache;
>  	void *cluster_data;
> +	void *copy_buff;
>  };
>
>  struct qcow1_header_disk {

Kevin