#include <linux/kernel.h>
#include <linux/types.h>
+
+/*
+ * Write @count bytes from @buf at @offset and flush the written data
+ * to disk with fdatasync().
+ *
+ * Returns -1 if the write fails, otherwise fdatasync()'s return
+ * value (0 on success, -1 on error).
+ */
+static inline int qcow_pwrite_sync(int fd,
+	void *buf, size_t count, off_t offset)
+{
+	if (pwrite_in_full(fd, buf, count, offset) < 0)
+		return -1;
+
+	return fdatasync(fd);
+}
+
static int l2_table_insert(struct rb_root *root, struct qcow_l2_table *new)
{
struct rb_node **link = &(root->rb_node), *parent = NULL;
size = 1 << header->l2_bits;
- if (pwrite_in_full(q->fd, c->table, size * sizeof(u64), c->offset) < 0)
+ if (qcow_pwrite_sync(q->fd, c->table,
+ size * sizeof(u64), c->offset) < 0)
return -1;
c->dirty = 0;
*/
lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list);
- if (qcow_l2_cache_write(q, lru) < 0)
- goto error;
-
/* Remove the node from the cache */
rb_erase(&lru->node, r);
list_del_init(&lru->list);
return total;
}
-static inline u64 file_size(int fd)
-{
- struct stat st;
-
- if (fstat(fd, &st) < 0)
- return 0;
-
- return st.st_size;
-}
-
-static inline int qcow_pwrite_sync(int fd, void *buf, size_t count, off_t offset)
-{
- if (pwrite_in_full(fd, buf, count, offset) < 0)
- return -1;
-
- return fdatasync(fd);
-}
-
-/* Writes a level 2 table at the end of the file. */
-static u64 qcow_write_l2_table(struct qcow *q, u64 *table)
-{
- struct qcow_header *header = q->header;
- u64 clust_sz;
- u64 f_sz;
- u64 off;
- u64 sz;
-
- f_sz = file_size(q->fd);
- if (!f_sz)
- return 0;
-
- sz = 1 << header->l2_bits;
- clust_sz = 1 << header->cluster_bits;
- off = ALIGN(f_sz, clust_sz);
-
- if (pwrite_in_full(q->fd, table, sz * sizeof(u64), off) < 0)
- return 0;
-
- return off;
-}
-
static void refcount_table_free_cache(struct qcow_refcount_table *rft)
{
struct rb_root *r = &rft->root;
if (!rfb->dirty)
return 0;
- if (pwrite_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb->offset) < 0)
+ if (qcow_pwrite_sync(q->fd, rfb->entries,
+ rfb->size * sizeof(u16), rfb->offset) < 0)
return -1;
rfb->dirty = 0;
if (rft->nr_cached == MAX_CACHE_NODES) {
lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
- if (write_refcount_block(q, lru) < 0)
- goto error;
-
rb_erase(&lru->node, r);
list_del_init(&lru->list);
rft->nr_cached--;
rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
+ if (!rfb_offset) {
+ pr_warning("Don't support to grow refcount table");
+ return NULL;
+ }
+
rfb = refcount_block_search(q, rfb_offset);
if (rfb)
return rfb;
return NULL;
}
+/*
+ * Look up the refcount of cluster @clust_idx through the refcount
+ * block cache.
+ *
+ * Returns the refcount on success. On error it returns -1, which in
+ * the u16 return type appears as (u16)-1 (0xffff) -- callers must
+ * compare against (u16)-1, not test "< 0".
+ */
+static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx)
+{
+	struct qcow_refcount_block *rfb = NULL;
+	struct qcow_header *header = q->header;
+	u64 rfb_idx;
+
+	rfb = qcow_read_refcount_block(q, clust_idx);
+	if (!rfb) {
+		pr_warning("Error while reading refcount table");
+		return -1;
+	}
+
+	/* index of this cluster's entry within its refcount block */
+	rfb_idx = clust_idx & (((1ULL <<
+		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+
+	if (rfb_idx >= rfb->size) {
+		pr_warning("L1: refcount block index out of bounds");
+		return -1;
+	}
+
+	return be16_to_cpu(rfb->entries[rfb_idx]);
+}
+
+/*
+ * Add @append to the refcount of cluster @clust_idx and write the
+ * updated refcount block back to disk. Passing (u16)-1 decrements
+ * the refcount via u16 wrap-around.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append)
+{
+	struct qcow_refcount_block *rfb = NULL;
+	struct qcow_header *header = q->header;
+	u16 refcount;
+	u64 rfb_idx;
+
+	rfb = qcow_read_refcount_block(q, clust_idx);
+	if (!rfb) {
+		pr_warning("error while reading refcount table");
+		return -1;
+	}
+
+	rfb_idx = clust_idx & (((1ULL <<
+		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+	if (rfb_idx >= rfb->size) {
+		pr_warning("refcount block index out of bounds");
+		return -1;
+	}
+
+	refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append;
+	rfb->entries[rfb_idx] = cpu_to_be16(refcount);
+	rfb->dirty = 1;
+
+	/* write refcount block */
+	if (write_refcount_block(q, rfb) < 0) {
+		pr_warning("error while writing refcount block");
+		return -1;
+	}
+
+	/* update free_clust_idx since refcount becomes zero */
+	if (!refcount && clust_idx < q->free_clust_idx)
+		q->free_clust_idx = clust_idx;
+
+	return 0;
+}
+
+/*
+ * Drop one reference on every cluster touched by the byte range
+ * [@clust_start, @clust_start + @size). The decrement is done by
+ * passing (u16)-1 as the append value, which wraps to -1 in u16
+ * arithmetic inside update_cluster_refcount().
+ */
+static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size)
+{
+	struct qcow_header *header = q->header;
+	u64 start, end, offset;
+
+	/* round the range outwards to whole cluster boundaries */
+	start = clust_start & ~(q->cluster_size - 1);
+	end = (clust_start + size - 1) & ~(q->cluster_size - 1);
+	for (offset = start; offset <= end; offset += q->cluster_size)
+		update_cluster_refcount(q, offset >> header->cluster_bits, -1);
+}
+
/*
- * QCOW file might grow during a write operation. Not only data but metadata is
- * also written at the end of the file. Therefore it is necessary to ensure
- * every write is committed to disk. Hence we use uses qcow_pwrite_sync() to
- * synchronize the in-core state of QCOW image to disk.
- *
- * We also try to restore the image to a consistent state if the metdata
- * operation fails. The two metadat operations are: level 1 and level 2 table
- * update. If either of them fails the image is truncated to a consistent state.
+ * Allocate clusters according to the requested size. Scan forward
+ * from free_clust_idx (initialized to zero) until a run of free
+ * clusters large enough for the size is found; free_clust_idx
+ * records the last scanned position.
*/
-static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
+/*
+ * Returns the file offset of the first cluster of the newly
+ * allocated run, or (u64)-1 on failure.
+ */
+static u64 qcow_alloc_clusters(struct qcow *q, u64 size)
+{
+	struct qcow_header *header = q->header;
+	u16 clust_refcount;
+	u64 clust_idx = 0, i;
+	u64 clust_num;
+
+	clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;
+
+again:
+	for (i = 0; i < clust_num; i++) {
+		/* u64 index: a u32 would truncate on very large images */
+		clust_idx = q->free_clust_idx++;
+		clust_refcount = qcow_get_refcount(q, clust_idx);
+		/*
+		 * qcow_get_refcount() returns u16; its error value -1
+		 * arrives as (u16)-1, so a "< 0" test can never fire.
+		 */
+		if (clust_refcount == (u16)-1)
+			return (u64)-1;
+		else if (clust_refcount > 0)
+			goto again;
+	}
+
+	for (i = 0; i < clust_num; i++)
+		if (update_cluster_refcount(q,
+			q->free_clust_idx - clust_num + i, 1))
+			return (u64)-1;
+
+	return (q->free_clust_idx - clust_num) << header->cluster_bits;
+}
+
+/*
+ * Write the in-core L1 table to its fixed location in the image
+ * (header->l1_table_offset) and sync it to disk.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int qcow_write_l1_table(struct qcow *q)
+{
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_header *header = q->header;
+
+	if (qcow_pwrite_sync(q->fd, l1t->l1_table,
+		l1t->table_size * sizeof(u64),
+		header->l1_table_offset) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Get the L2 table covering @offset. If the table has already been
+ * copied (QCOW2_OFLAG_COPIED), read it directly. Otherwise allocate
+ * a new cluster and copy the existing table, if any, into it.
+ */
+static int get_cluster_table(struct qcow *q, u64 offset,
+ struct qcow_l2_table **result_l2t, u64 *result_l2_index)
{
struct qcow_header *header = q->header;
struct qcow_l1_table *l1t = &q->table;
struct qcow_l2_table *l2t;
- u64 clust_start;
- u64 clust_flags;
- u64 l2t_offset;
- u64 clust_off;
- u64 l2t_size;
- u64 clust_sz;
u64 l1t_idx;
+ u64 l2t_offset;
u64 l2t_idx;
- u64 f_sz;
- u64 len;
+ u64 l2t_size;
+ u64 l2t_new_offset;
- l2t = NULL;
- l2t_size = 1 << header->l2_bits;
- clust_sz = 1 << header->cluster_bits;
+ l2t_size = 1 << header->l2_bits;
l1t_idx = get_l1_index(q, offset);
if (l1t_idx >= l1t->table_size)
if (l2t_idx >= l2t_size)
return -1;
- clust_off = get_cluster_offset(q, offset);
- if (clust_off >= clust_sz)
- return -1;
-
- len = clust_sz - clust_off;
- if (len > src_len)
- len = src_len;
-
- mutex_lock(&q->mutex);
-
l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]);
- if (l2t_offset & QCOW2_OFLAG_COMPRESSED) {
- pr_warning("compressed clusters are not supported");
- goto error;
- }
- if (!(l2t_offset & QCOW2_OFLAG_COPIED)) {
- pr_warning("L2 copy-on-write clusters are not supported");
- goto error;
- }
-
- l2t_offset &= QCOW2_OFFSET_MASK;
- if (l2t_offset) {
- /* read and cache l2 table */
+ if (l2t_offset & QCOW2_OFLAG_COPIED) {
+ l2t_offset &= ~QCOW2_OFLAG_COPIED;
l2t = qcow_read_l2_table(q, l2t_offset);
if (!l2t)
goto error;
} else {
- l2t = new_cache_table(q, l2t_offset);
- if (!l2t)
+ l2t_new_offset = qcow_alloc_clusters(q, l2t_size*sizeof(u64));
+ if (l2t_new_offset < 0)
goto error;
- /* Capture the state of the consistent QCOW image */
- f_sz = file_size(q->fd);
- if (!f_sz)
- goto free_cache;
+ l2t = new_cache_table(q, l2t_new_offset);
+ if (!l2t)
+ goto free_cluster;
+
+ if (l2t_offset) {
+ l2t = qcow_read_l2_table(q, l2t_offset);
+ if (!l2t)
+ goto free_cache;
+ } else
+ memset(l2t->table, 0x00, l2t_size * sizeof(u64));
- /* Write the l2 table of 0's at the end of the file */
- l2t_offset = qcow_write_l2_table(q, l2t->table);
- if (!l2t_offset)
+ /* write l2 table */
+ l2t->dirty = 1;
+ if (qcow_l2_cache_write(q, l2t) < 0)
goto free_cache;
- if (cache_table(q, l2t) < 0) {
- if (ftruncate(q->fd, f_sz) < 0)
- goto free_cache;
+ /* cache l2 table */
+ if (cache_table(q, l2t))
+ goto free_cache;
+		/* update the l1 table */
+		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset
+			| QCOW2_OFLAG_COPIED);
+		if (qcow_write_l1_table(q)) {
+			pr_warning("Update l1 table error");
+			goto free_cache;
		}
-		/* Update the in-core entry */
-		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_offset);
+		/* free the old L2 table cluster, but only if one existed:
+		 * l2t_offset == 0 means there was no table, and freeing
+		 * would decrement the refcount of cluster 0 (the header). */
+		if (l2t_offset)
+			qcow_free_clusters(q, l2t_offset, q->cluster_size);
	}
- /* Capture the state of the consistent QCOW image */
- f_sz = file_size(q->fd);
- if (!f_sz)
- goto error;
+ *result_l2t = l2t;
+ *result_l2_index = l2t_idx;
- clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+ return 0;
- clust_flags = clust_start & QCOW2_OFLAGS_MASK;
- if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
- pr_warning("compressed clusters are not supported");
- goto error;
- }
+free_cache:
+ free(l2t);
- clust_start &= QCOW2_OFFSET_MASK;
- if (!clust_start) {
- clust_start = ALIGN(f_sz, clust_sz);
- l2t->table[l2t_idx] = cpu_to_be64(clust_start | QCOW2_OFLAG_COPIED);
- l2t->dirty = 1;
- }
+free_cluster:
+ qcow_free_clusters(q, l2t_new_offset, q->cluster_size);
- if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
- struct qcow_refcount_block *rfb = NULL;
- u16 clust_refcount;
- u64 clust_idx;
- u64 rfb_idx;
+error:
+ return -1;
+}
- clust_idx = (clust_start & QCOW2_OFFSET_MASK)
- >> (header->cluster_bits);
+/*
+ * If the cluster has been copied, write data directly. If not,
+ * read the original data and write it to the new cluster with
+ * modification.
+ */
+static ssize_t qcow_write_cluster(struct qcow *q, u64 offset,
+ void *buf, u32 src_len)
+{
+ struct qcow_l2_table *l2t;
+ u64 clust_new_start;
+ u64 clust_start;
+ u64 clust_flags;
+ u64 clust_off;
+ u64 l2t_idx;
+ u64 len;
- rfb = qcow_read_refcount_block(q, clust_idx);
- if (!rfb) {
- pr_warning("L1: error while reading refcount table");
- goto error;
- }
+ l2t = NULL;
- rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
- if (rfb_idx >= rfb->size) {
- pr_warning("L1: refcount block index out of bounds");
+ clust_off = get_cluster_offset(q, offset);
+ if (clust_off >= q->cluster_size)
+ return -1;
+
+ len = q->cluster_size - clust_off;
+ if (len > src_len)
+ len = src_len;
+
+ mutex_lock(&q->mutex);
+
+ if (get_cluster_table(q, offset, &l2t, &l2t_idx)) {
+ pr_warning("Get l2 table error");
+ goto error;
+ }
+
+ clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+ clust_flags = clust_start & QCOW2_OFLAGS_MASK;
+
+ clust_start &= QCOW2_OFFSET_MASK;
+	if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
+		clust_new_start = qcow_alloc_clusters(q, q->cluster_size);
+		/* qcow_alloc_clusters() returns (u64)-1 on failure; an
+		 * unsigned value is never < 0, so test the sentinel. */
+		if (clust_new_start == (u64)-1) {
+			pr_warning("Cluster alloc error");
+			goto error;
		}
- clust_refcount = be16_to_cpu(rfb->entries[rfb_idx]);
- if (!clust_refcount) {
- clust_refcount = 1;
- rfb->entries[rfb_idx] = cpu_to_be16(clust_refcount);
- rfb->dirty = 1;
- }
+ offset &= ~(q->cluster_size - 1);
+
+ /* if clust_start is not zero, read the original data*/
+ if (clust_start) {
+ mutex_unlock(&q->mutex);
+ if (qcow2_read_cluster(q, offset, q->copy_buff,
+ q->cluster_size) < 0) {
+ pr_warning("Read copy cluster error");
+ qcow_free_clusters(q, clust_new_start,
+ q->cluster_size);
+ return -1;
+ }
+ mutex_lock(&q->mutex);
+ } else
+ memset(q->copy_buff, 0x00, q->cluster_size);
+
+ memcpy(q->copy_buff + clust_off, buf, len);
+
+ /* Write actual data */
+ if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size,
+ clust_new_start) < 0)
+ goto free_cluster;
+
+ /* update l2 table*/
+ l2t->table[l2t_idx] = cpu_to_be64(clust_new_start
+ | QCOW2_OFLAG_COPIED);
+ l2t->dirty = 1;
+
+ if (qcow_l2_cache_write(q, l2t))
+ goto free_cluster;
+
+ /* free old cluster*/
+ if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
+ int size;
+ size = ((clust_start >> q->csize_shift) &
+ q->csize_mask) + 1;
+ size *= 512;
+ clust_start &= q->cluster_offset_mask;
+ clust_start &= ~511;
+
+ qcow_free_clusters(q, clust_start, size);
+ } else if (clust_start)
+ qcow_free_clusters(q, clust_start, q->cluster_size);
- if (clust_refcount > 1) {
- pr_warning("L1 copy-on-write clusters are not supported");
+ } else {
+ /* Write actual data */
+ if (pwrite_in_full(q->fd, buf, len,
+ clust_start + clust_off) < 0)
goto error;
- }
}
-
mutex_unlock(&q->mutex);
-
- /* Write actual data */
- if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
- return -1;
-
return len;
-free_cache:
- free(l2t);
+free_cluster:
+ qcow_free_clusters(q, clust_new_start, q->cluster_size);
+
error:
mutex_unlock(&q->mutex);
return -1;
{
struct qcow *q = disk->priv;
struct qcow_refcount_table *rft;
- struct qcow_header *header;
struct list_head *pos, *n;
struct qcow_l1_table *l1t;
- header = q->header;
l1t = &q->table;
rft = &q->refcount_table;
goto error_unlock;
}
- if (fdatasync(disk->fd) < 0)
- goto error_unlock;
-
list_for_each_safe(pos, n, &l1t->lru_list) {
struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list);
goto error_unlock;
}
- if (fdatasync(disk->fd) < 0)
- goto error_unlock;
-
-	if (pwrite_in_full(disk->fd, l1t->l1_table, l1t->table_size * sizeof(u64), header->l1_table_offset) < 0)
+	/* must actually call the function: the bare name decays to a
+	 * function pointer and "ptr < 0" is always false, silently
+	 * skipping the L1 table flush on error. */
+	if (qcow_write_l1_table(q) < 0)
		goto error_unlock;
mutex_unlock(&q->mutex);
refcount_table_free_cache(&q->refcount_table);
l1_table_free_cache(&q->table);
+ free(q->copy_buff);
free(q->cluster_data);
free(q->cluster_cache);
free(q->refcount_table.rf_table);
q->cluster_offset_mask = (1LL << q->csize_shift) - 1;
q->cluster_size = 1 << q->header->cluster_bits;
+ q->copy_buff = malloc(q->cluster_size);
+ if (!q->copy_buff) {
+ pr_warning("copy buff malloc error");
+ goto free_header;
+ }
+
q->cluster_data = malloc(q->cluster_size);
if (!q->cluster_data) {
- pr_warning("cluster data malloc error!");
- goto free_header;
+ pr_warning("cluster data malloc error");
+ goto free_copy_buff;
}
q->cluster_cache = malloc(q->cluster_size);
if (!q->cluster_cache) {
- pr_warning("cluster cache malloc error!");
+ pr_warning("cluster cache malloc error");
goto free_cluster_data;
}
free_cluster_data:
if (q->cluster_data)
free(q->cluster_data);
+free_copy_buff:
+ if (q->copy_buff)
+ free(q->copy_buff);
free_header:
if (q->header)
free(q->header);
q->version = QCOW1_VERSION;
q->cluster_size = 1 << q->header->cluster_bits;
q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1;
+ q->free_clust_idx = 0;
q->cluster_data = malloc(q->cluster_size);
if (!q->cluster_data) {
- pr_warning("cluster data malloc error!");
+ pr_warning("cluster data malloc error");
goto free_header;
}
q->cluster_cache = malloc(q->cluster_size);
if (!q->cluster_cache) {
- pr_warning("cluster cache malloc error!");
+ pr_warning("cluster cache malloc error");
goto free_cluster_data;
}