From f063a8aadb70d2601173c0c622e1677ce59d05c3 Mon Sep 17 00:00:00 2001 From: Thomas Lamprecht Date: Wed, 27 May 2020 14:40:46 +0200 Subject: [PATCH] fix vmstate-snapshots with iothread=1 Signed-off-by: Thomas Lamprecht --- ...e-more-code-to-cleanup-and-rename-to.patch | 188 ++++++++++++++++++ ...til-async-Add-aio_co_reschedule_self.patch | 86 ++++++++ ...sh-IOThread-drives-async-before-ente.patch | 58 ++++++ ...savevm-async-add-debug-timing-prints.patch | 80 ++++++++ debian/patches/series | 4 + 5 files changed, 416 insertions(+) create mode 100644 debian/patches/pve/0045-savevm-async-move-more-code-to-cleanup-and-rename-to.patch create mode 100644 debian/patches/pve/0046-util-async-Add-aio_co_reschedule_self.patch create mode 100644 debian/patches/pve/0047-savevm-async-flush-IOThread-drives-async-before-ente.patch create mode 100644 debian/patches/pve/0048-savevm-async-add-debug-timing-prints.patch diff --git a/debian/patches/pve/0045-savevm-async-move-more-code-to-cleanup-and-rename-to.patch b/debian/patches/pve/0045-savevm-async-move-more-code-to-cleanup-and-rename-to.patch new file mode 100644 index 0000000..1880717 --- /dev/null +++ b/debian/patches/pve/0045-savevm-async-move-more-code-to-cleanup-and-rename-to.patch @@ -0,0 +1,188 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Stefan Reiter +Date: Wed, 27 May 2020 11:33:19 +0200 +Subject: [PATCH] savevm-async: move more code to cleanup and rename to + finalize + +process_savevm_cleanup is renamed to process_savevm_finalize to +accomodate more code that is not all cleanup related. + +The benefit of this is that it allows us to call functions which need to +run in the main AIOContext directly. It doesn't majorly affect snapshot +performance, since the first instruction that is moved stops the VM, +so the downtime stays about the same. + +The target bdrv is additionally moved to the IOHandler context before +process_savevm_co to make sure the coroutine can call functions that +require it to own the bdrv's context. process_savevm_finalize then moves +it back to the main context to run its part. + +Signed-off-by: Stefan Reiter +--- + savevm-async.c | 87 +++++++++++++++++++++++++++++--------------------- + 1 file changed, 51 insertions(+), 36 deletions(-) + +diff --git a/savevm-async.c b/savevm-async.c +index c3fe741c38..2894c94233 100644 +--- a/savevm-async.c ++++ b/savevm-async.c +@@ -50,7 +50,7 @@ static struct SnapshotState { + int saved_vm_running; + QEMUFile *file; + int64_t total_time; +- QEMUBH *cleanup_bh; ++ QEMUBH *finalize_bh; + Coroutine *co; + } snap_state; + +@@ -196,12 +196,42 @@ static const QEMUFileOps block_file_ops = { + .close = block_state_close, + }; + +-static void process_savevm_cleanup(void *opaque) ++static void process_savevm_finalize(void *opaque) + { + int ret; +- qemu_bh_delete(snap_state.cleanup_bh); +- snap_state.cleanup_bh = NULL; ++ AioContext *iohandler_ctx = iohandler_get_aio_context(); ++ MigrationState *ms = migrate_get_current(); ++ ++ qemu_bh_delete(snap_state.finalize_bh); ++ snap_state.finalize_bh = NULL; + snap_state.co = NULL; ++ ++ /* We need to own the target bdrv's context for the following functions, ++ * so move it back. It can stay in the main context and live out its live ++ * there, since we're done with it after this method ends anyway. 
++ */ ++ aio_context_acquire(iohandler_ctx); ++ blk_set_aio_context(snap_state.target, qemu_get_aio_context(), NULL); ++ aio_context_release(iohandler_ctx); ++ ++ ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); ++ if (ret < 0) { ++ save_snapshot_error("vm_stop_force_state error %d", ret); ++ } ++ ++ (void)qemu_savevm_state_complete_precopy(snap_state.file, false, false); ++ ret = qemu_file_get_error(snap_state.file); ++ if (ret < 0) { ++ save_snapshot_error("qemu_savevm_state_iterate error %d", ret); ++ } ++ ++ DPRINTF("state saving complete\n"); ++ ++ /* clear migration state */ ++ migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, ++ ret ? MIGRATION_STATUS_FAILED : MIGRATION_STATUS_COMPLETED); ++ ms->to_dst_file = NULL; ++ + qemu_savevm_state_cleanup(); + + ret = save_snapshot_cleanup(); +@@ -219,16 +249,15 @@ static void process_savevm_cleanup(void *opaque) + } + } + +-static void process_savevm_coro(void *opaque) ++static void coroutine_fn process_savevm_co(void *opaque) + { + int ret; + int64_t maxlen; +- MigrationState *ms = migrate_get_current(); + + ret = qemu_file_get_error(snap_state.file); + if (ret < 0) { + save_snapshot_error("qemu_savevm_state_setup failed"); +- goto out; ++ return; + } + + while (snap_state.state == SAVE_STATE_ACTIVE) { +@@ -245,7 +274,7 @@ static void process_savevm_coro(void *opaque) + save_snapshot_error("qemu_savevm_state_iterate error %d", ret); + break; + } +- DPRINTF("savevm inerate pending size %lu ret %d\n", pending_size, ret); ++ DPRINTF("savevm iterate pending size %lu ret %d\n", pending_size, ret); + } else { + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); + ret = global_state_store(); +@@ -253,40 +282,20 @@ static void process_savevm_coro(void *opaque) + save_snapshot_error("global_state_store error %d", ret); + break; + } +- ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); +- if (ret < 0) { +- save_snapshot_error("vm_stop_force_state error %d", ret); +- break; +- } +- DPRINTF("savevm inerate finished\n"); +- /* upstream made the return value here inconsistent +- * (-1 instead of 'ret' in one case and 0 after flush which can +- * still set a file error...) +- */ +- (void)qemu_savevm_state_complete_precopy(snap_state.file, false, false); +- ret = qemu_file_get_error(snap_state.file); +- if (ret < 0) { +- save_snapshot_error("qemu_savevm_state_iterate error %d", ret); +- break; +- } +- DPRINTF("save complete\n"); ++ ++ DPRINTF("savevm iterate complete\n"); + break; + } + } + +- qemu_bh_schedule(snap_state.cleanup_bh); +- +-out: +- /* set migration state accordingly and clear soon-to-be stale file */ +- migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, +- ret ? MIGRATION_STATUS_FAILED : MIGRATION_STATUS_COMPLETED); +- ms->to_dst_file = NULL; ++ qemu_bh_schedule(snap_state.finalize_bh); + } + + void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp) + { + Error *local_err = NULL; + MigrationState *ms = migrate_get_current(); ++ AioContext *iohandler_ctx = iohandler_get_aio_context(); + + int bdrv_oflags = BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH; + +@@ -347,7 +356,7 @@ void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp) + + /* + * qemu_savevm_* paths use migration code and expect a migration state. +- * State is cleared in process_savevm_thread, but has to be initialized ++ * State is cleared in process_savevm_co, but has to be initialized + * here (blocking main thread, from QMP) to avoid race conditions. 
+ */ + migrate_init(ms); +@@ -358,13 +367,19 @@ void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp) + blk_op_block_all(snap_state.target, snap_state.blocker); + + snap_state.state = SAVE_STATE_ACTIVE; +- snap_state.cleanup_bh = qemu_bh_new(process_savevm_cleanup, &snap_state); +- snap_state.co = qemu_coroutine_create(&process_savevm_coro, NULL); ++ snap_state.finalize_bh = qemu_bh_new(process_savevm_finalize, &snap_state); ++ snap_state.co = qemu_coroutine_create(&process_savevm_co, NULL); + qemu_mutex_unlock_iothread(); + qemu_savevm_state_header(snap_state.file); + qemu_savevm_state_setup(snap_state.file); + qemu_mutex_lock_iothread(); +- aio_co_schedule(iohandler_get_aio_context(), snap_state.co); ++ ++ /* Async processing from here on out happens in iohandler context, so let ++ * the target bdrv have its home there. ++ */ ++ blk_set_aio_context(snap_state.target, iohandler_ctx, &local_err); ++ ++ aio_co_schedule(iohandler_ctx, snap_state.co); + + return; + diff --git a/debian/patches/pve/0046-util-async-Add-aio_co_reschedule_self.patch b/debian/patches/pve/0046-util-async-Add-aio_co_reschedule_self.patch new file mode 100644 index 0000000..6791261 --- /dev/null +++ b/debian/patches/pve/0046-util-async-Add-aio_co_reschedule_self.patch @@ -0,0 +1,86 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 27 May 2020 11:33:20 +0200 +Subject: [PATCH] util/async: Add aio_co_reschedule_self() + +From: Kevin Wolf + +Add a function that can be used to move the currently running coroutine +to a different AioContext (and therefore potentially a different +thread). + +Signed-off-by: Kevin Wolf +--- + include/block/aio.h | 10 ++++++++++ + util/async.c | 30 ++++++++++++++++++++++++++++++ + 2 files changed, 40 insertions(+) + +diff --git a/include/block/aio.h b/include/block/aio.h +index 62ed954344..d5399c67d6 100644 +--- a/include/block/aio.h ++++ b/include/block/aio.h +@@ -17,6 +17,7 @@ + #ifdef CONFIG_LINUX_IO_URING + #include + #endif ++#include "qemu/coroutine.h" + #include "qemu/queue.h" + #include "qemu/event_notifier.h" + #include "qemu/thread.h" +@@ -654,6 +655,15 @@ static inline bool aio_node_check(AioContext *ctx, bool is_external) + */ + void aio_co_schedule(AioContext *ctx, struct Coroutine *co); + ++/** ++ * aio_co_reschedule_self: ++ * @new_ctx: the new context ++ * ++ * Move the currently running coroutine to new_ctx. If the coroutine is already ++ * running in new_ctx, do nothing. 
++ */ ++void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx); ++ + /** + * aio_co_wake: + * @co: the coroutine +diff --git a/util/async.c b/util/async.c +index 3165a28f2f..4eba1e6f1b 100644 +--- a/util/async.c ++++ b/util/async.c +@@ -558,6 +558,36 @@ void aio_co_schedule(AioContext *ctx, Coroutine *co) + aio_context_unref(ctx); + } + ++typedef struct AioCoRescheduleSelf { ++ Coroutine *co; ++ AioContext *new_ctx; ++} AioCoRescheduleSelf; ++ ++static void aio_co_reschedule_self_bh(void *opaque) ++{ ++ AioCoRescheduleSelf *data = opaque; ++ aio_co_schedule(data->new_ctx, data->co); ++} ++ ++void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx) ++{ ++ AioContext *old_ctx = qemu_get_current_aio_context(); ++ ++ if (old_ctx != new_ctx) { ++ AioCoRescheduleSelf data = { ++ .co = qemu_coroutine_self(), ++ .new_ctx = new_ctx, ++ }; ++ /* ++ * We can't directly schedule the coroutine in the target context ++ * because this would be racy: The other thread could try to enter the ++ * coroutine before it has yielded in this one. ++ */ ++ aio_bh_schedule_oneshot(old_ctx, aio_co_reschedule_self_bh, &data); ++ qemu_coroutine_yield(); ++ } ++} ++ + void aio_co_wake(struct Coroutine *co) + { + AioContext *ctx; diff --git a/debian/patches/pve/0047-savevm-async-flush-IOThread-drives-async-before-ente.patch b/debian/patches/pve/0047-savevm-async-flush-IOThread-drives-async-before-ente.patch new file mode 100644 index 0000000..b68ec1f --- /dev/null +++ b/debian/patches/pve/0047-savevm-async-flush-IOThread-drives-async-before-ente.patch @@ -0,0 +1,58 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Stefan Reiter +Date: Wed, 27 May 2020 11:33:21 +0200 +Subject: [PATCH] savevm-async: flush IOThread-drives async before entering + blocking part + +By flushing all drives where its possible to so before entering the +blocking part (where the VM is stopped), we can reduce the time spent in +said part for every disk that has an IOThread (other drives cannot be +flushed async anyway). + +Suggested-by: Thomas Lamprecht +Signed-off-by: Stefan Reiter +--- + savevm-async.c | 23 +++++++++++++++++++++++ + 1 file changed, 23 insertions(+) + +diff --git a/savevm-async.c b/savevm-async.c +index 2894c94233..4ce83a0691 100644 +--- a/savevm-async.c ++++ b/savevm-async.c +@@ -253,6 +253,8 @@ static void coroutine_fn process_savevm_co(void *opaque) + { + int ret; + int64_t maxlen; ++ BdrvNextIterator it; ++ BlockDriverState *bs = NULL; + + ret = qemu_file_get_error(snap_state.file); + if (ret < 0) { +@@ -288,6 +290,27 @@ static void coroutine_fn process_savevm_co(void *opaque) + } + } + ++ /* If a drive runs in an IOThread we can flush it async, and only ++ * need to sync-flush whatever IO happens between now and ++ * vm_stop_force_state. bdrv_next can only be called from main AioContext, ++ * so move there now and after every flush. 
++ */ ++ aio_co_reschedule_self(qemu_get_aio_context()); ++ for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { ++ /* target has BDRV_O_NO_FLUSH, no sense calling bdrv_flush on it */ ++ if (bs == blk_bs(snap_state.target)) { ++ continue; ++ } ++ ++ AioContext *bs_ctx = bdrv_get_aio_context(bs); ++ if (bs_ctx != qemu_get_aio_context()) { ++ DPRINTF("savevm: async flushing drive %s\n", bs->filename); ++ aio_co_reschedule_self(bs_ctx); ++ bdrv_flush(bs); ++ aio_co_reschedule_self(qemu_get_aio_context()); ++ } ++ } ++ + qemu_bh_schedule(snap_state.finalize_bh); + } + diff --git a/debian/patches/pve/0048-savevm-async-add-debug-timing-prints.patch b/debian/patches/pve/0048-savevm-async-add-debug-timing-prints.patch new file mode 100644 index 0000000..68b7635 --- /dev/null +++ b/debian/patches/pve/0048-savevm-async-add-debug-timing-prints.patch @@ -0,0 +1,80 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Stefan Reiter +Date: Wed, 27 May 2020 11:33:22 +0200 +Subject: [PATCH] savevm-async: add debug timing prints + +Signed-off-by: Stefan Reiter +[ Thomas: guard variable declaration by DEBUG #ifdef ] +Signed-off-by: Thomas Lamprecht +--- + savevm-async.c | 22 ++++++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/savevm-async.c b/savevm-async.c +index 4ce83a0691..0388cebbe9 100644 +--- a/savevm-async.c ++++ b/savevm-async.c +@@ -202,6 +202,10 @@ static void process_savevm_finalize(void *opaque) + AioContext *iohandler_ctx = iohandler_get_aio_context(); + MigrationState *ms = migrate_get_current(); + ++#ifdef DEBUG_SAVEVM_STATE ++ int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); ++#endif ++ + qemu_bh_delete(snap_state.finalize_bh); + snap_state.finalize_bh = NULL; + snap_state.co = NULL; +@@ -226,6 +230,8 @@ static void process_savevm_finalize(void *opaque) + } + + DPRINTF("state saving complete\n"); ++ DPRINTF("timing: process_savevm_finalize (state saving) took %ld ms\n", ++ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time); + + /* clear migration state */ + migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, +@@ -247,6 +253,9 @@ static void process_savevm_finalize(void *opaque) + vm_start(); + snap_state.saved_vm_running = false; + } ++ ++ DPRINTF("timing: process_savevm_finalize (full) took %ld ms\n", ++ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time); + } + + static void coroutine_fn process_savevm_co(void *opaque) +@@ -256,6 +265,10 @@ static void coroutine_fn process_savevm_co(void *opaque) + BdrvNextIterator it; + BlockDriverState *bs = NULL; + ++#ifdef DEBUG_SAVEVM_STATE ++ int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); ++#endif ++ + ret = qemu_file_get_error(snap_state.file); + if (ret < 0) { + save_snapshot_error("qemu_savevm_state_setup failed"); +@@ -290,6 +303,12 @@ static void coroutine_fn process_savevm_co(void *opaque) + } + } + ++ DPRINTF("timing: process_savevm_co took %ld ms\n", ++ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time); ++ ++#ifdef DEBUG_SAVEVM_STATE ++ int64_t start_time_flush = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); ++#endif + /* If a drive runs in an IOThread we can flush it async, and only + * need to sync-flush whatever IO happens between now and + * vm_stop_force_state. 
bdrv_next can only be called from main AioContext, +@@ -311,6 +330,9 @@ static void coroutine_fn process_savevm_co(void *opaque) + } + } + ++ DPRINTF("timing: async flushing took %ld ms\n", ++ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time_flush); ++ + qemu_bh_schedule(snap_state.finalize_bh); + } + diff --git a/debian/patches/series b/debian/patches/series index 670b744..e7345ce 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -42,3 +42,7 @@ pve/0041-PVE-Backup-avoid-use-QemuRecMutex-inside-coroutines.patch pve/0042-PVE-Backup-use-QemuMutex-instead-of-QemuRecMutex.patch pve/0043-move-savevm-async-back-into-a-coroutine.patch pve/0044-add-optional-buffer-size-to-QEMUFile.patch +pve/0045-savevm-async-move-more-code-to-cleanup-and-rename-to.patch +pve/0046-util-async-Add-aio_co_reschedule_self.patch +pve/0047-savevm-async-flush-IOThread-drives-async-before-ente.patch +pve/0048-savevm-async-add-debug-timing-prints.patch
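
Reviewer note (not part of the series): patch 0046 only adds the primitive, patch 0047 is its first user. The pattern is "hop the running coroutine into the AioContext that owns the drive, flush there while the guest is still running, hop back". Below is a minimal sketch of that pattern in isolation; the helper name flush_drive_in_own_context() is made up for illustration, the QEMU calls are the ones the patches themselves use, and the fragment only builds inside the QEMU tree.

    #include "qemu/osdep.h"
    #include "qemu/coroutine.h"
    #include "block/aio.h"
    #include "block/block.h"

    /*
     * Flush one drive from a coroutine running in the main AioContext.
     * Drives owned by an IOThread are flushed from their own context now,
     * so the synchronous flush done later while the VM is stopped has less
     * left to write out. Main-context drives are skipped, mirroring patch
     * 0047: they are flushed during vm_stop_force_state() anyway.
     */
    static void coroutine_fn flush_drive_in_own_context(BlockDriverState *bs)
    {
        AioContext *main_ctx = qemu_get_aio_context();
        AioContext *bs_ctx = bdrv_get_aio_context(bs);

        if (bs_ctx == main_ctx) {
            return;
        }

        aio_co_reschedule_self(bs_ctx);     /* continue in the drive's IOThread */
        bdrv_flush(bs);
        aio_co_reschedule_self(main_ctx);   /* return before touching
                                             * main-loop-only state again */
    }

The flush is still sequential per drive; "async" here only means it happens before the guest is stopped. Whatever I/O the guest issues between this flush and vm_stop_force_state() is picked up by the synchronous flush during the stop, which is why the 0047 commit message claims a reduction of the blocked window, not its removal.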
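
Reviewer note on patch 0048: the DPRINTF() timing lines are compiled out unless DEBUG_SAVEVM_STATE is defined, which is why the new start_time variables are wrapped in the same #ifdef. For reference, the guard at the top of savevm-async.c looks roughly like the following; this is an assumption about the existing file, not something this series changes.

    /* #define DEBUG_SAVEVM_STATE */

    #ifdef DEBUG_SAVEVM_STATE
    #define DPRINTF(fmt, ...) \
        do { printf("savevm-async: " fmt, ## __VA_ARGS__); } while (0)
    #else
    #define DPRINTF(fmt, ...) \
        do { } while (0)
    #endif

To see the timing output while testing, uncomment the define (or pass an equivalent -DDEBUG_SAVEVM_STATE at build time) and rebuild; the messages go to the QEMU process's stdout.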