bf251437e9
Many changes were necessary this time around: * QAPI was changed to avoid redundant has_* variables, see commit 44ea9d9be3 ("qapi: Start to elide redundant has_FOO in generated C") for details. This affected many QMP commands added by Proxmox too. * Pending querying for migration got split into two functions, one to estimate, one for exact value, see commit c8df4a7aef ("migration: Split save_live_pending() into state_pending_*") for details. Relevant for savevm-async and PBS dirty bitmap. * Some block (driver) functions got converted to coroutines, so the Proxmox block drivers needed to be adapted. * Alloc track auto-detaching during PBS live restore got broken by AioContext-related changes resulting in a deadlock. The current, hacky method was replaced by a simpler one. Stefan apparently ran into a problem with that when he wrote the driver, but there were improvements in the stream job code since then and I didn't manage to reproduce the issue. It's a separate patch "alloc-track: fix deadlock during drop" for now, you can find the details there. * Async snapshot-related changes: - The pending querying got adapted to the above-mentioned split and a patch is added to optimize it/make it more similar to what upstream code does. - Added initialization of the compression counters (for future-proofing). - It's necessary the hold the BQL (big QEMU lock = iothread mutex) during the setup phase, because block layer functions are used there and not doing so leads to racy, hard-to-debug crashes or hangs. It's necessary to change some upstream code too for this, a version of the patch "migration: for snapshots, hold the BQL during setup callbacks" is intended to be upstreamed. - Need to take the bdrv graph read lock before flushing. * hmp_info_balloon was moved to a different file. * Needed to include a new headers from time to time to still get the correct functions. Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
191 lines
6.4 KiB
Diff
191 lines
6.4 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: Fiona Ebner <f.ebner@proxmox.com>
|
|
Date: Fri, 5 May 2023 13:39:53 +0200
|
|
Subject: [PATCH] migration: for snapshots, hold the BQL during setup callbacks
|
|
|
|
In spirit, this is a partial revert of commit 9b09503752 ("migration:
|
|
run setup callbacks out of big lock"), but only for the snapshot case.
|
|
|
|
For snapshots, the bdrv_writev_vmstate() function is used during setup
|
|
(in QIOChannelBlock backing the QEMUFile), but not holding the BQL
|
|
while calling it could lead to an assertion failure. To understand
|
|
how, first note the following:
|
|
|
|
1. Generated coroutine wrappers for block layer functions spawn the
|
|
coroutine and use AIO_WAIT_WHILE()/aio_poll() to wait for it.
|
|
2. If the host OS switches threads at an inconvenient time, it can
|
|
happen that a bottom half scheduled for the main thread's AioContext
|
|
is executed as part of a vCPU thread's aio_poll().
|
|
|
|
An example leading to the assertion failure is as follows:
|
|
|
|
main thread:
|
|
1. A snapshot-save QMP command gets issued.
|
|
2. snapshot_save_job_bh() is scheduled.
|
|
|
|
vCPU thread:
|
|
3. aio_poll() for the main thread's AioContext is called (e.g. when
|
|
the guest writes to a pflash device, as part of blk_pwrite which is a
|
|
generated coroutine wrapper).
|
|
4. snapshot_save_job_bh() is executed as part of aio_poll().
|
|
3. qemu_savevm_state() is called.
|
|
4. qemu_mutex_unlock_iothread() is called. Now
|
|
qemu_get_current_aio_context() returns 0x0.
|
|
5. bdrv_writev_vmstate() is executed during the usual savevm setup.
|
|
But this function is a generated coroutine wrapper, so it uses
|
|
AIO_WAIT_WHILE. There, the assertion
|
|
assert(qemu_get_current_aio_context() == qemu_get_aio_context());
|
|
will fail.
|
|
|
|
To fix it, ensure that the BQL is held during setup. To avoid changing
|
|
the behavior for migration too, introduce conditionals for the setup
|
|
callbacks that need the BQL and only take the lock if it's not already
|
|
held.
|
|
|
|
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
|
|
---
|
|
include/migration/register.h | 2 +-
|
|
migration/block-dirty-bitmap.c | 15 ++++++++++++---
|
|
migration/block.c | 15 ++++++++++++---
|
|
migration/ram.c | 16 +++++++++++++---
|
|
migration/savevm.c | 2 --
|
|
5 files changed, 38 insertions(+), 12 deletions(-)
|
|
|
|
diff --git a/include/migration/register.h b/include/migration/register.h
|
|
index a8dfd8fefd..fa9b0b0f10 100644
|
|
--- a/include/migration/register.h
|
|
+++ b/include/migration/register.h
|
|
@@ -43,9 +43,9 @@ typedef struct SaveVMHandlers {
|
|
* by other locks.
|
|
*/
|
|
int (*save_live_iterate)(QEMUFile *f, void *opaque);
|
|
+ int (*save_setup)(QEMUFile *f, void *opaque);
|
|
|
|
/* This runs outside the iothread lock! */
|
|
- int (*save_setup)(QEMUFile *f, void *opaque);
|
|
/* Note for save_live_pending:
|
|
* must_precopy:
|
|
* - must be migrated in precopy or in stopped state
|
|
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
|
|
index a6440929fa..69fab3275c 100644
|
|
--- a/migration/block-dirty-bitmap.c
|
|
+++ b/migration/block-dirty-bitmap.c
|
|
@@ -1214,10 +1214,17 @@ static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
|
|
{
|
|
DBMSaveState *s = &((DBMState *)opaque)->save;
|
|
SaveBitmapState *dbms = NULL;
|
|
+ bool release_lock = false;
|
|
|
|
- qemu_mutex_lock_iothread();
|
|
+ /* For snapshots, the BQL is held during setup. */
|
|
+ if (!qemu_mutex_iothread_locked()) {
|
|
+ qemu_mutex_lock_iothread();
|
|
+ release_lock = true;
|
|
+ }
|
|
if (init_dirty_bitmap_migration(s) < 0) {
|
|
- qemu_mutex_unlock_iothread();
|
|
+ if (release_lock) {
|
|
+ qemu_mutex_unlock_iothread();
|
|
+ }
|
|
return -1;
|
|
}
|
|
|
|
@@ -1225,7 +1232,9 @@ static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
|
|
send_bitmap_start(f, s, dbms);
|
|
}
|
|
qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
|
|
- qemu_mutex_unlock_iothread();
|
|
+ if (release_lock) {
|
|
+ qemu_mutex_unlock_iothread();
|
|
+ }
|
|
return 0;
|
|
}
|
|
|
|
diff --git a/migration/block.c b/migration/block.c
|
|
index b2497bbd32..c9d55be642 100644
|
|
--- a/migration/block.c
|
|
+++ b/migration/block.c
|
|
@@ -716,21 +716,30 @@ static void block_migration_cleanup(void *opaque)
|
|
static int block_save_setup(QEMUFile *f, void *opaque)
|
|
{
|
|
int ret;
|
|
+ bool release_lock = false;
|
|
|
|
trace_migration_block_save("setup", block_mig_state.submitted,
|
|
block_mig_state.transferred);
|
|
|
|
- qemu_mutex_lock_iothread();
|
|
+ /* For snapshots, the BQL is held during setup. */
|
|
+ if (!qemu_mutex_iothread_locked()) {
|
|
+ qemu_mutex_lock_iothread();
|
|
+ release_lock = true;
|
|
+ }
|
|
ret = init_blk_migration(f);
|
|
if (ret < 0) {
|
|
- qemu_mutex_unlock_iothread();
|
|
+ if (release_lock) {
|
|
+ qemu_mutex_unlock_iothread();
|
|
+ }
|
|
return ret;
|
|
}
|
|
|
|
/* start track dirty blocks */
|
|
ret = set_dirty_tracking();
|
|
|
|
- qemu_mutex_unlock_iothread();
|
|
+ if (release_lock) {
|
|
+ qemu_mutex_unlock_iothread();
|
|
+ }
|
|
|
|
if (ret) {
|
|
return ret;
|
|
diff --git a/migration/ram.c b/migration/ram.c
|
|
index 79d881f735..0ecbbc3202 100644
|
|
--- a/migration/ram.c
|
|
+++ b/migration/ram.c
|
|
@@ -3117,8 +3117,16 @@ static void migration_bitmap_clear_discarded_pages(RAMState *rs)
|
|
|
|
static void ram_init_bitmaps(RAMState *rs)
|
|
{
|
|
- /* For memory_global_dirty_log_start below. */
|
|
- qemu_mutex_lock_iothread();
|
|
+ bool release_lock = false;
|
|
+
|
|
+ /*
|
|
+ * For memory_global_dirty_log_start below.
|
|
+ * For snapshots, the BQL is held during setup.
|
|
+ */
|
|
+ if (!qemu_mutex_iothread_locked()) {
|
|
+ qemu_mutex_lock_iothread();
|
|
+ release_lock = true;
|
|
+ }
|
|
qemu_mutex_lock_ramlist();
|
|
|
|
WITH_RCU_READ_LOCK_GUARD() {
|
|
@@ -3130,7 +3138,9 @@ static void ram_init_bitmaps(RAMState *rs)
|
|
}
|
|
}
|
|
qemu_mutex_unlock_ramlist();
|
|
- qemu_mutex_unlock_iothread();
|
|
+ if (release_lock) {
|
|
+ qemu_mutex_unlock_iothread();
|
|
+ }
|
|
|
|
/*
|
|
* After an eventual first bitmap sync, fixup the initial bitmap
|
|
diff --git a/migration/savevm.c b/migration/savevm.c
|
|
index aa54a67fda..fc6a82a555 100644
|
|
--- a/migration/savevm.c
|
|
+++ b/migration/savevm.c
|
|
@@ -1621,10 +1621,8 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
|
|
memset(&compression_counters, 0, sizeof(compression_counters));
|
|
ms->to_dst_file = f;
|
|
|
|
- qemu_mutex_unlock_iothread();
|
|
qemu_savevm_state_header(f);
|
|
qemu_savevm_state_setup(f);
|
|
- qemu_mutex_lock_iothread();
|
|
|
|
while (qemu_file_get_error(f) == 0) {
|
|
if (qemu_savevm_state_iterate(f, false) > 0) {
|