fix vmstate-snapshots with iothread=1

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
Thomas Lamprecht 2020-05-27 14:40:46 +02:00
parent 63be960208
commit f063a8aadb
5 changed files with 416 additions and 0 deletions

pve/0045-savevm-async-move-more-code-to-cleanup-and-rename-to.patch

@@ -0,0 +1,188 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Stefan Reiter <s.reiter@proxmox.com>
Date: Wed, 27 May 2020 11:33:19 +0200
Subject: [PATCH] savevm-async: move more code to cleanup and rename to
finalize
process_savevm_cleanup is renamed to process_savevm_finalize to
accommodate more code that is not all cleanup related.
The benefit of this is that it allows us to directly call functions that
need to run in the main AioContext. It does not noticeably affect
snapshot performance, since the first instruction that is moved stops
the VM, so the downtime stays about the same.
The target bdrv is additionally moved to the IOHandler context before
process_savevm_co to make sure the coroutine can call functions that
require it to own the bdrv's context. process_savevm_finalize then moves
it back to the main context to run its part.
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
---
savevm-async.c | 87 +++++++++++++++++++++++++++++---------------------
1 file changed, 51 insertions(+), 36 deletions(-)
diff --git a/savevm-async.c b/savevm-async.c
index c3fe741c38..2894c94233 100644
--- a/savevm-async.c
+++ b/savevm-async.c
@@ -50,7 +50,7 @@ static struct SnapshotState {
int saved_vm_running;
QEMUFile *file;
int64_t total_time;
- QEMUBH *cleanup_bh;
+ QEMUBH *finalize_bh;
Coroutine *co;
} snap_state;
@@ -196,12 +196,42 @@ static const QEMUFileOps block_file_ops = {
.close = block_state_close,
};
-static void process_savevm_cleanup(void *opaque)
+static void process_savevm_finalize(void *opaque)
{
int ret;
- qemu_bh_delete(snap_state.cleanup_bh);
- snap_state.cleanup_bh = NULL;
+ AioContext *iohandler_ctx = iohandler_get_aio_context();
+ MigrationState *ms = migrate_get_current();
+
+ qemu_bh_delete(snap_state.finalize_bh);
+ snap_state.finalize_bh = NULL;
snap_state.co = NULL;
+
+ /* We need to own the target bdrv's context for the following functions,
+ * so move it back. It can stay in the main context and live out its life
+ * there, since we're done with it after this method ends anyway.
+ */
+ aio_context_acquire(iohandler_ctx);
+ blk_set_aio_context(snap_state.target, qemu_get_aio_context(), NULL);
+ aio_context_release(iohandler_ctx);
+
+ ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+ if (ret < 0) {
+ save_snapshot_error("vm_stop_force_state error %d", ret);
+ }
+
+ (void)qemu_savevm_state_complete_precopy(snap_state.file, false, false);
+ ret = qemu_file_get_error(snap_state.file);
+ if (ret < 0) {
+ save_snapshot_error("qemu_savevm_state_iterate error %d", ret);
+ }
+
+ DPRINTF("state saving complete\n");
+
+ /* clear migration state */
+ migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP,
+ ret ? MIGRATION_STATUS_FAILED : MIGRATION_STATUS_COMPLETED);
+ ms->to_dst_file = NULL;
+
qemu_savevm_state_cleanup();
ret = save_snapshot_cleanup();
@@ -219,16 +249,15 @@ static void process_savevm_cleanup(void *opaque)
}
}
-static void process_savevm_coro(void *opaque)
+static void coroutine_fn process_savevm_co(void *opaque)
{
int ret;
int64_t maxlen;
- MigrationState *ms = migrate_get_current();
ret = qemu_file_get_error(snap_state.file);
if (ret < 0) {
save_snapshot_error("qemu_savevm_state_setup failed");
- goto out;
+ return;
}
while (snap_state.state == SAVE_STATE_ACTIVE) {
@@ -245,7 +274,7 @@ static void process_savevm_coro(void *opaque)
save_snapshot_error("qemu_savevm_state_iterate error %d", ret);
break;
}
- DPRINTF("savevm inerate pending size %lu ret %d\n", pending_size, ret);
+ DPRINTF("savevm iterate pending size %lu ret %d\n", pending_size, ret);
} else {
qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
ret = global_state_store();
@@ -253,40 +282,20 @@ static void process_savevm_coro(void *opaque)
save_snapshot_error("global_state_store error %d", ret);
break;
}
- ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
- if (ret < 0) {
- save_snapshot_error("vm_stop_force_state error %d", ret);
- break;
- }
- DPRINTF("savevm inerate finished\n");
- /* upstream made the return value here inconsistent
- * (-1 instead of 'ret' in one case and 0 after flush which can
- * still set a file error...)
- */
- (void)qemu_savevm_state_complete_precopy(snap_state.file, false, false);
- ret = qemu_file_get_error(snap_state.file);
- if (ret < 0) {
- save_snapshot_error("qemu_savevm_state_iterate error %d", ret);
- break;
- }
- DPRINTF("save complete\n");
+
+ DPRINTF("savevm iterate complete\n");
break;
}
}
- qemu_bh_schedule(snap_state.cleanup_bh);
-
-out:
- /* set migration state accordingly and clear soon-to-be stale file */
- migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP,
- ret ? MIGRATION_STATUS_FAILED : MIGRATION_STATUS_COMPLETED);
- ms->to_dst_file = NULL;
+ qemu_bh_schedule(snap_state.finalize_bh);
}
void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp)
{
Error *local_err = NULL;
MigrationState *ms = migrate_get_current();
+ AioContext *iohandler_ctx = iohandler_get_aio_context();
int bdrv_oflags = BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH;
@@ -347,7 +356,7 @@ void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp)
/*
* qemu_savevm_* paths use migration code and expect a migration state.
- * State is cleared in process_savevm_thread, but has to be initialized
+ * State is cleared in process_savevm_co, but has to be initialized
* here (blocking main thread, from QMP) to avoid race conditions.
*/
migrate_init(ms);
@@ -358,13 +367,19 @@ void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp)
blk_op_block_all(snap_state.target, snap_state.blocker);
snap_state.state = SAVE_STATE_ACTIVE;
- snap_state.cleanup_bh = qemu_bh_new(process_savevm_cleanup, &snap_state);
- snap_state.co = qemu_coroutine_create(&process_savevm_coro, NULL);
+ snap_state.finalize_bh = qemu_bh_new(process_savevm_finalize, &snap_state);
+ snap_state.co = qemu_coroutine_create(&process_savevm_co, NULL);
qemu_mutex_unlock_iothread();
qemu_savevm_state_header(snap_state.file);
qemu_savevm_state_setup(snap_state.file);
qemu_mutex_lock_iothread();
- aio_co_schedule(iohandler_get_aio_context(), snap_state.co);
+
+ /* Async processing from here on out happens in iohandler context, so let
+ * the target bdrv have its home there.
+ */
+ blk_set_aio_context(snap_state.target, iohandler_ctx, &local_err);
+
+ aio_co_schedule(iohandler_ctx, snap_state.co);
return;
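
For illustration only, a minimal sketch (not part of the patch above; names shortened, bodies elided) of the control flow the patch establishes: the save coroutine runs in the iohandler AioContext while the guest keeps running, and the blocking finalization is deferred to a bottom half that executes in the main loop.

    #include "qemu/osdep.h"
    #include "qemu/coroutine.h"
    #include "qemu/main-loop.h"
    #include "block/aio.h"

    static QEMUBH *finalize_bh;

    /* Bottom half, runs in the main loop: the blocking part
     * (stop the VM, write the remaining state, clean up). */
    static void finalize(void *opaque)
    {
        qemu_bh_delete(finalize_bh);
        /* ... vm_stop_force_state(), complete precopy, cleanup ... */
    }

    /* Coroutine, runs in the iohandler context: iterate the vmstate
     * while the guest keeps running, then hand off to the BH. */
    static void coroutine_fn save_co(void *opaque)
    {
        /* ... qemu_savevm_state_iterate() loop ... */
        qemu_bh_schedule(finalize_bh);
    }

    /* QMP handler, main thread: set up both and kick off the coroutine. */
    void start_save(void)
    {
        finalize_bh = qemu_bh_new(finalize, NULL);
        aio_co_schedule(iohandler_get_aio_context(),
                        qemu_coroutine_create(save_co, NULL));
    }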

pve/0046-util-async-Add-aio_co_reschedule_self.patch

@@ -0,0 +1,86 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 27 May 2020 11:33:20 +0200
Subject: [PATCH] util/async: Add aio_co_reschedule_self()
From: Kevin Wolf <kwolf@redhat.com>
Add a function that can be used to move the currently running coroutine
to a different AioContext (and therefore potentially a different
thread).
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
include/block/aio.h | 10 ++++++++++
util/async.c | 30 ++++++++++++++++++++++++++++++
2 files changed, 40 insertions(+)
diff --git a/include/block/aio.h b/include/block/aio.h
index 62ed954344..d5399c67d6 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -17,6 +17,7 @@
#ifdef CONFIG_LINUX_IO_URING
#include <liburing.h>
#endif
+#include "qemu/coroutine.h"
#include "qemu/queue.h"
#include "qemu/event_notifier.h"
#include "qemu/thread.h"
@@ -654,6 +655,15 @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
*/
void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
+/**
+ * aio_co_reschedule_self:
+ * @new_ctx: the new context
+ *
+ * Move the currently running coroutine to new_ctx. If the coroutine is already
+ * running in new_ctx, do nothing.
+ */
+void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
+
/**
* aio_co_wake:
* @co: the coroutine
diff --git a/util/async.c b/util/async.c
index 3165a28f2f..4eba1e6f1b 100644
--- a/util/async.c
+++ b/util/async.c
@@ -558,6 +558,36 @@ void aio_co_schedule(AioContext *ctx, Coroutine *co)
aio_context_unref(ctx);
}
+typedef struct AioCoRescheduleSelf {
+ Coroutine *co;
+ AioContext *new_ctx;
+} AioCoRescheduleSelf;
+
+static void aio_co_reschedule_self_bh(void *opaque)
+{
+ AioCoRescheduleSelf *data = opaque;
+ aio_co_schedule(data->new_ctx, data->co);
+}
+
+void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx)
+{
+ AioContext *old_ctx = qemu_get_current_aio_context();
+
+ if (old_ctx != new_ctx) {
+ AioCoRescheduleSelf data = {
+ .co = qemu_coroutine_self(),
+ .new_ctx = new_ctx,
+ };
+ /*
+ * We can't directly schedule the coroutine in the target context
+ * because this would be racy: The other thread could try to enter the
+ * coroutine before it has yielded in this one.
+ */
+ aio_bh_schedule_oneshot(old_ctx, aio_co_reschedule_self_bh, &data);
+ qemu_coroutine_yield();
+ }
+}
+
void aio_co_wake(struct Coroutine *co)
{
AioContext *ctx;
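
A hypothetical usage sketch of the new helper (not part of the patch; bs is assumed to be an open BlockDriverState): from coroutine context, hop into the AioContext that owns a block device, flush it there, and hop back to the main context. This is the pattern the next savevm-async patch builds on.

    #include "qemu/osdep.h"
    #include "qemu/coroutine.h"
    #include "block/aio.h"
    #include "block/block.h"

    /* Must be called from coroutine context. */
    static void coroutine_fn flush_in_own_context(BlockDriverState *bs)
    {
        AioContext *main_ctx = qemu_get_aio_context();

        /* Move this coroutine to the context owning bs; this is a no-op
         * if we are already running there. */
        aio_co_reschedule_self(bdrv_get_aio_context(bs));
        bdrv_flush(bs);

        /* Return to the main context so the caller continues there. */
        aio_co_reschedule_self(main_ctx);
    }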

pve/0047-savevm-async-flush-IOThread-drives-async-before-ente.patch

@@ -0,0 +1,58 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Stefan Reiter <s.reiter@proxmox.com>
Date: Wed, 27 May 2020 11:33:21 +0200
Subject: [PATCH] savevm-async: flush IOThread-drives async before entering
blocking part
By flushing all drives where it's possible to do so before entering the
blocking part (where the VM is stopped), we can reduce the time spent in
that part for every disk that has an IOThread (other drives cannot be
flushed asynchronously anyway).
Suggested-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
---
savevm-async.c | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/savevm-async.c b/savevm-async.c
index 2894c94233..4ce83a0691 100644
--- a/savevm-async.c
+++ b/savevm-async.c
@@ -253,6 +253,8 @@ static void coroutine_fn process_savevm_co(void *opaque)
{
int ret;
int64_t maxlen;
+ BdrvNextIterator it;
+ BlockDriverState *bs = NULL;
ret = qemu_file_get_error(snap_state.file);
if (ret < 0) {
@@ -288,6 +290,27 @@ static void coroutine_fn process_savevm_co(void *opaque)
}
}
+ /* If a drive runs in an IOThread we can flush it async, and only
+ * need to sync-flush whatever IO happens between now and
+ * vm_stop_force_state. bdrv_next can only be called from main AioContext,
+ * so move there now and after every flush.
+ */
+ aio_co_reschedule_self(qemu_get_aio_context());
+ for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+ /* target has BDRV_O_NO_FLUSH, no sense calling bdrv_flush on it */
+ if (bs == blk_bs(snap_state.target)) {
+ continue;
+ }
+
+ AioContext *bs_ctx = bdrv_get_aio_context(bs);
+ if (bs_ctx != qemu_get_aio_context()) {
+ DPRINTF("savevm: async flushing drive %s\n", bs->filename);
+ aio_co_reschedule_self(bs_ctx);
+ bdrv_flush(bs);
+ aio_co_reschedule_self(qemu_get_aio_context());
+ }
+ }
+
qemu_bh_schedule(snap_state.finalize_bh);
}

pve/0048-savevm-async-add-debug-timing-prints.patch

@@ -0,0 +1,80 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Stefan Reiter <s.reiter@proxmox.com>
Date: Wed, 27 May 2020 11:33:22 +0200
Subject: [PATCH] savevm-async: add debug timing prints
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
[ Thomas: guard variable declaration by DEBUG #ifdef ]
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
savevm-async.c | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/savevm-async.c b/savevm-async.c
index 4ce83a0691..0388cebbe9 100644
--- a/savevm-async.c
+++ b/savevm-async.c
@@ -202,6 +202,10 @@ static void process_savevm_finalize(void *opaque)
AioContext *iohandler_ctx = iohandler_get_aio_context();
MigrationState *ms = migrate_get_current();
+#ifdef DEBUG_SAVEVM_STATE
+ int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+#endif
+
qemu_bh_delete(snap_state.finalize_bh);
snap_state.finalize_bh = NULL;
snap_state.co = NULL;
@@ -226,6 +230,8 @@ static void process_savevm_finalize(void *opaque)
}
DPRINTF("state saving complete\n");
+ DPRINTF("timing: process_savevm_finalize (state saving) took %ld ms\n",
+ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
/* clear migration state */
migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP,
@@ -247,6 +253,9 @@ static void process_savevm_finalize(void *opaque)
vm_start();
snap_state.saved_vm_running = false;
}
+
+ DPRINTF("timing: process_savevm_finalize (full) took %ld ms\n",
+ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
}
static void coroutine_fn process_savevm_co(void *opaque)
@@ -256,6 +265,10 @@ static void coroutine_fn process_savevm_co(void *opaque)
BdrvNextIterator it;
BlockDriverState *bs = NULL;
+#ifdef DEBUG_SAVEVM_STATE
+ int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+#endif
+
ret = qemu_file_get_error(snap_state.file);
if (ret < 0) {
save_snapshot_error("qemu_savevm_state_setup failed");
@@ -290,6 +303,12 @@ static void coroutine_fn process_savevm_co(void *opaque)
}
}
+ DPRINTF("timing: process_savevm_co took %ld ms\n",
+ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
+
+#ifdef DEBUG_SAVEVM_STATE
+ int64_t start_time_flush = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+#endif
/* If a drive runs in an IOThread we can flush it async, and only
* need to sync-flush whatever IO happens between now and
* vm_stop_force_state. bdrv_next can only be called from main AioContext,
@@ -311,6 +330,9 @@ static void coroutine_fn process_savevm_co(void *opaque)
}
}
+ DPRINTF("timing: async flushing took %ld ms\n",
+ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time_flush);
+
qemu_bh_schedule(snap_state.finalize_bh);
}
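
The timing prints use the DPRINTF helper that already exists in savevm-async.c; its definition is not part of this diff, but it is assumed to follow the usual pattern sketched below. This also explains why the start_time variables need the DEBUG_SAVEVM_STATE guard: without it they would be unused (and trigger warnings) whenever DPRINTF compiles to nothing.

    /* assumed shape of the debug helper in savevm-async.c,
     * enabled by defining DEBUG_SAVEVM_STATE */
    /* #define DEBUG_SAVEVM_STATE */

    #ifdef DEBUG_SAVEVM_STATE
    #define DPRINTF(fmt, ...) \
        do { printf("savevm-async: " fmt, ## __VA_ARGS__); } while (0)
    #else
    #define DPRINTF(fmt, ...) \
        do { } while (0)
    #endif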

@@ -42,3 +42,7 @@ pve/0041-PVE-Backup-avoid-use-QemuRecMutex-inside-coroutines.patch
pve/0042-PVE-Backup-use-QemuMutex-instead-of-QemuRecMutex.patch
pve/0043-move-savevm-async-back-into-a-coroutine.patch
pve/0044-add-optional-buffer-size-to-QEMUFile.patch
pve/0045-savevm-async-move-more-code-to-cleanup-and-rename-to.patch
pve/0046-util-async-Add-aio_co_reschedule_self.patch
pve/0047-savevm-async-flush-IOThread-drives-async-before-ente.patch
pve/0048-savevm-async-add-debug-timing-prints.patch