20209d8d73
Excerpt from Fiona's v3 cover-letter [0]: When a backup for a VM is started, QEMU will install a "copy-before-write" filter in its block layer. This filter ensures that upon new guest writes, old data still needed for the backup is sent to the backup target first. The guest write blocks until this operation is finished so guest IO to not-yet-backed-up sectors will be limited by the speed of the backup target. With backup fleecing, such old data is cached in a fleecing image rather than sent directly to the backup target. This can help guest IO performance and even prevent hangs in certain scenarios, at the cost of requiring more storage space. With this series it will be possible to enable backup-fleecing via e.g. `vzdump 123 --fleecing enabled=1,storage=local-lvm` with fleecing images created on the storage `local-lvm`. The fleecing storage should be a fast local storage which supports thin-provisioning and discard. If the storage supports qcow2, that is used as the fleecing image format. If the underlying file system does not support discard, with qcow2 and preallocation=off, at least already allocated parts of the image can be re-used later. Fleecing images are created by qemu-server via pve-storage and attached to QEMU before the backup starts, and cleaned up after the backup finished or failed. The naming schema for fleecing images is 'vm-ID-fleece-N(.FORMAT)'. The allocated images are recorded in the guest configuration, so that even after a hard failure, clean-up can be re-attempted. While not too bad, it's a non-trivial amount of code and I'm not 100% sure about the cost-benefit, so sending those as RFC. The fleecing image needs to be the exact same size as the source, but luckily, an explicit size can be specified when attaching a raw image to QEMU so there are no size issues when using storages that have coarser allocation/round up. For qcow2, it seems that virtual size can be nearly arbitrary (i.e. modulo 512 byte granularity) during allocation. 
[0]: https://lists.proxmox.com/pipermail/pve-devel/2024-April/062815.html Originally-by: Fiona Ebner <f.ebner@proxmox.com> Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
134 lines
5.7 KiB
Diff
134 lines
5.7 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: Fiona Ebner <f.ebner@proxmox.com>
|
|
Date: Thu, 11 Apr 2024 11:29:26 +0200
|
|
Subject: [PATCH] copy-before-write: allow specifying minimum cluster size
|
|
|
|
Useful to make discard-source work in the context of backup fleecing
|
|
when the fleecing image has a larger granularity than the backup
|
|
target.
|
|
|
|
Copy-before-write operations will use at least this granularity and in
|
|
particular, discard requests to the source node will too. If the
|
|
granularity is too small, they will just be aligned down in
|
|
cbw_co_pdiscard_snapshot() and thus effectively ignored.
|
|
|
|
The QAPI uses uint32 so the value will be non-negative, but still fit
|
|
into an int64_t.
|
|
|
|
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
|
|
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
|
|
---
|
|
block/block-copy.c | 17 +++++++++++++----
|
|
block/copy-before-write.c | 3 ++-
|
|
include/block/block-copy.h | 1 +
|
|
qapi/block-core.json | 8 +++++++-
|
|
4 files changed, 23 insertions(+), 6 deletions(-)
|
|
|
|
diff --git a/block/block-copy.c b/block/block-copy.c
|
|
index 3c61e52bae..c9a722a5a6 100644
|
|
--- a/block/block-copy.c
|
|
+++ b/block/block-copy.c
|
|
@@ -310,6 +310,7 @@ void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
|
|
}
|
|
|
|
static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
|
|
+ int64_t min_cluster_size,
|
|
Error **errp)
|
|
{
|
|
int ret;
|
|
@@ -330,7 +331,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
|
|
"used. If the actual block size of the target exceeds "
|
|
"this default, the backup may be unusable",
|
|
BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
|
|
- return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
|
|
+ return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
|
|
} else if (ret < 0 && !target_does_cow) {
|
|
error_setg_errno(errp, -ret,
|
|
"Couldn't determine the cluster size of the target image, "
|
|
@@ -340,16 +341,18 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
|
|
return ret;
|
|
} else if (ret < 0 && target_does_cow) {
|
|
/* Not fatal; just trudge on ahead. */
|
|
- return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
|
|
+ return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
|
|
}
|
|
|
|
- return MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
|
|
+ return MAX(min_cluster_size,
|
|
+ MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size));
|
|
}
|
|
|
|
BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
|
|
BlockDriverState *copy_bitmap_bs,
|
|
const BdrvDirtyBitmap *bitmap,
|
|
bool discard_source,
|
|
+ int64_t min_cluster_size,
|
|
Error **errp)
|
|
{
|
|
ERRP_GUARD();
|
|
@@ -358,7 +361,13 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
|
|
BdrvDirtyBitmap *copy_bitmap;
|
|
bool is_fleecing;
|
|
|
|
- cluster_size = block_copy_calculate_cluster_size(target->bs, errp);
|
|
+ if (min_cluster_size && !is_power_of_2(min_cluster_size)) {
|
|
+ error_setg(errp, "min-cluster-size needs to be a power of 2");
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ cluster_size = block_copy_calculate_cluster_size(target->bs,
|
|
+ min_cluster_size, errp);
|
|
if (cluster_size < 0) {
|
|
return NULL;
|
|
}
|
|
diff --git a/block/copy-before-write.c b/block/copy-before-write.c
|
|
index 3503702d71..4a8c5bdb62 100644
|
|
--- a/block/copy-before-write.c
|
|
+++ b/block/copy-before-write.c
|
|
@@ -479,7 +479,8 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
|
|
|
|
s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE;
|
|
s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap,
|
|
- flags & BDRV_O_CBW_DISCARD_SOURCE, errp);
|
|
+ flags & BDRV_O_CBW_DISCARD_SOURCE,
|
|
+ opts->min_cluster_size, errp);
|
|
if (!s->bcs) {
|
|
error_prepend(errp, "Cannot create block-copy-state: ");
|
|
ret = -EINVAL;
|
|
diff --git a/include/block/block-copy.h b/include/block/block-copy.h
|
|
index bdc703bacd..77857c6c68 100644
|
|
--- a/include/block/block-copy.h
|
|
+++ b/include/block/block-copy.h
|
|
@@ -28,6 +28,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
|
|
BlockDriverState *copy_bitmap_bs,
|
|
const BdrvDirtyBitmap *bitmap,
|
|
bool discard_source,
|
|
+ int64_t min_cluster_size,
|
|
Error **errp);
|
|
|
|
/* Function should be called prior any actual copy request */
|
|
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
|
index 4297e5beda..33e7e3c090 100644
|
|
--- a/qapi/block-core.json
|
|
+++ b/qapi/block-core.json
|
|
@@ -4825,12 +4825,18 @@
|
|
# @on-cbw-error parameter will decide how this failure is handled.
|
|
# Default 0. (Since 7.1)
|
|
#
|
|
+# @min-cluster-size: Minimum size of blocks used by copy-before-write
|
|
+# operations. Has to be a power of 2. No effect if smaller than
|
|
+# the maximum of the target's cluster size and 64 KiB. Default 0.
|
|
+# (Since 8.1)
|
|
+#
|
|
# Since: 6.2
|
|
##
|
|
{ 'struct': 'BlockdevOptionsCbw',
|
|
'base': 'BlockdevOptionsGenericFormat',
|
|
'data': { 'target': 'BlockdevRef', '*bitmap': 'BlockDirtyBitmap',
|
|
- '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32' } }
|
|
+ '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32',
|
|
+ '*min-cluster-size': 'uint32' } }
|
|
|
|
##
|
|
# @BlockdevOptions:
|