commit 32ee41155b
bump build-dependency on libproxmox-backup-qemu0-dev with version query support

Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Stefan Reiter <s.reiter@proxmox.com>
Date: Thu, 22 Oct 2020 17:01:07 +0200
Subject: [PATCH] PVE: fix and clean up error handling for create_backup_jobs

No more weird bool returns, just the standard "errp" format used
everywhere else. With this, if backup_job_create fails, the error
message is actually returned over QMP and can be shown to the user.

To facilitate correct cleanup on such an error, we call
create_backup_jobs as a bottom half directly from pvebackup_co_prepare.
This additionally allows us to actually hold the backup_mutex during
the operation.

Also add a job_cancel_sync before job_unref, since a job must be in
STATUS_NULL to be deleted by unref, which could previously trigger an
assert.

Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
---
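
For readers unfamiliar with it, the "errp" convention the message refers
to looks roughly like this — a minimal sketch assuming QEMU's
qapi/error.h; op_that_reports() and its failure condition are
hypothetical:

#include "qapi/error.h"

/* hypothetical leaf function: reports failure through the
 * caller-provided errp instead of returning a bool */
static void op_that_reports(Error **errp)
{
    bool failed = true; /* stand-in for a real failure condition */
    if (failed) {
        error_setg(errp, "operation failed: %s", "reason");
    }
}

static void caller(Error **errp)
{
    Error *local_err = NULL;

    op_that_reports(&local_err);
    if (local_err) {
        /* hand the error up, e.g. all the way to the QMP client */
        error_propagate(errp, local_err);
        return;
    }
}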
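
The bottom-half handoff the message describes follows this general
shape — a sketch based on the hunks below, assuming QEMU's aio and
coroutine APIs; example_bh()/example_co() are placeholder names:

#include "qemu/osdep.h"
#include "qemu/coroutine.h"
#include "block/aio.h"
#include "qapi/error.h"

/* helper struct as used by the Proxmox patch series */
typedef struct CoCtxData {
    Coroutine *co;   /* coroutine to resume once the BH is done */
    AioContext *ctx; /* AioContext that coroutine runs in */
    void *data;      /* payload; here an Error ** */
} CoCtxData;

static void example_bh(void *opaque)
{
    CoCtxData *data = opaque;
    Error **errp = data->data;

    /* non-coroutine work goes here; report failure through errp */
    if (false /* stand-in for a real failure condition */) {
        error_setg(errp, "BH work failed");
    }

    /* reschedule the coroutine that is yielding on us */
    aio_co_enter(data->ctx, data->co);
}

static void coroutine_fn example_co(Error **task_errp)
{
    Error *local_err = NULL;
    CoCtxData waker = {
        .co = qemu_coroutine_self(),
        .ctx = qemu_get_current_aio_context(),
        .data = &local_err,
    };

    aio_bh_schedule_oneshot(waker.ctx, example_bh, &waker);
    qemu_coroutine_yield(); /* resumed by aio_co_enter() in example_bh */

    if (local_err) {
        error_propagate(task_errp, local_err);
    }
}

Because the coroutine only yields and is resumed in the same
AioContext, a CoMutex taken before aio_bh_schedule_oneshot() stays
validly held across the yield — which is what lets pvebackup_co_prepare
keep backup_mutex locked while the jobs are created.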
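
And the job teardown fix, isolated — a sketch assuming QEMU 5.x's job
API (where job_cancel_sync() still takes a single argument);
drop_unstarted_job() is an illustrative name, standing in for the
cleanup loop over &di->job->job in the hunks below:

#include "qemu/osdep.h"
#include "qemu/job.h"
#include "block/aio.h"

/* Dropping the last reference to a job that was created but never
 * started: job_unref() requires the job to be in STATUS_NULL, so
 * cancel it synchronously first, under its AioContext lock. */
static void drop_unstarted_job(Job *job)
{
    AioContext *ctx = job->aio_context;

    aio_context_acquire(ctx);
    job_cancel_sync(job); /* drives the job to STATUS_NULL */
    job_unref(job);       /* safe now; would have asserted before */
    aio_context_release(ctx);
}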

 pve-backup.c | 79 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 54 insertions(+), 25 deletions(-)

diff --git a/pve-backup.c b/pve-backup.c
index 4402c0cb17..c7cde0fb0e 100644
--- a/pve-backup.c
+++ b/pve-backup.c
@@ -50,6 +50,7 @@ static struct PVEBackupState {
         size_t zero_bytes;
         GList *bitmap_list;
         bool finishing;
+        bool starting;
     } stat;
     int64_t speed;
     VmaWriter *vmaw;
@@ -277,6 +278,16 @@ static void coroutine_fn pvebackup_co_complete_stream(void *opaque)
     PVEBackupDevInfo *di = opaque;
     int ret = di->completed_ret;
 
+    qemu_mutex_lock(&backup_state.stat.lock);
+    bool starting = backup_state.stat.starting;
+    qemu_mutex_unlock(&backup_state.stat.lock);
+    if (starting) {
+        /* in 'starting' state, no tasks have been run yet, meaning we can (and
+         * must) skip all cleanup, as we don't know what has and hasn't been
+         * initialized yet. */
+        return;
+    }
+
     qemu_co_mutex_lock(&backup_state.backup_mutex);
 
     if (ret < 0) {
@@ -441,15 +452,15 @@ static int coroutine_fn pvebackup_co_add_config(
 /*
  * backup_job_create can *not* be run from a coroutine (and requires an
  * acquired AioContext), so this can't either.
- * This does imply that this function cannot run with backup_mutex acquired.
- * That is ok because it is only ever called between setting up the backup_state
- * struct and starting the jobs, and from within a QMP call. This means that no
- * other QMP call can interrupt, and no background job is running yet.
+ * The caller is responsible that backup_mutex is held nonetheless.
  */
-static bool create_backup_jobs(void) {
+static void create_backup_jobs_bh(void *opaque) {
 
     assert(!qemu_in_coroutine());
 
+    CoCtxData *data = (CoCtxData*)opaque;
+    Error **errp = (Error**)data->data;
+
     Error *local_err = NULL;
 
     /* create job transaction to synchronize bitmap commit and cancel all
@@ -483,24 +494,19 @@ static bool create_backup_jobs(void) {
 
         aio_context_release(aio_context);
 
-        if (!job || local_err != NULL) {
-            Error *create_job_err = NULL;
-            error_setg(&create_job_err, "backup_job_create failed: %s",
-                       local_err ? error_get_pretty(local_err) : "null");
+        di->job = job;
 
-            pvebackup_propagate_error(create_job_err);
+        if (!job || local_err) {
+            error_setg(errp, "backup_job_create failed: %s",
+                       local_err ? error_get_pretty(local_err) : "null");
             break;
         }
 
-        di->job = job;
-
         bdrv_unref(di->target);
         di->target = NULL;
     }
 
-    bool errors = pvebackup_error_or_canceled();
-
-    if (errors) {
+    if (*errp) {
         l = backup_state.di_list;
         while (l) {
             PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data;
@@ -512,12 +518,17 @@ static bool create_backup_jobs(void) {
             }
 
             if (di->job) {
+                AioContext *ctx = di->job->job.aio_context;
+                aio_context_acquire(ctx);
+                job_cancel_sync(&di->job->job);
                 job_unref(&di->job->job);
+                aio_context_release(ctx);
             }
         }
     }
 
-    return errors;
+    /* return */
+    aio_co_enter(data->ctx, data->co);
 }
 
 typedef struct QmpBackupTask {
@@ -883,6 +894,7 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
     backup_state.stat.transferred = 0;
     backup_state.stat.zero_bytes = 0;
     backup_state.stat.finishing = false;
+    backup_state.stat.starting = true;
 
     qemu_mutex_unlock(&backup_state.stat.lock);
 
@@ -898,7 +910,32 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
 
     task->result = uuid_info;
 
+    /* Run create_backup_jobs_bh outside of coroutine (in BH) but keep
+     * backup_mutex locked. This is fine, a CoMutex can be held across yield
+     * points, and we'll release it as soon as the BH reschedules us.
+     */
+    CoCtxData waker = {
+        .co = qemu_coroutine_self(),
+        .ctx = qemu_get_current_aio_context(),
+        .data = &local_err,
+    };
+    aio_bh_schedule_oneshot(waker.ctx, create_backup_jobs_bh, &waker);
+    qemu_coroutine_yield();
+
+    if (local_err) {
+        error_propagate(task->errp, local_err);
+        goto err;
+    }
+
     qemu_co_mutex_unlock(&backup_state.backup_mutex);
+
+    qemu_mutex_lock(&backup_state.stat.lock);
+    backup_state.stat.starting = false;
+    qemu_mutex_unlock(&backup_state.stat.lock);
+
+    /* start the first job in the transaction */
+    job_txn_start_seq(backup_state.txn);
+
     return;
 
 err_mutex:
@@ -921,6 +958,7 @@ err:
         g_free(di);
     }
     g_list_free(di_list);
+    backup_state.di_list = NULL;
 
     if (devs) {
         g_strfreev(devs);
@@ -998,15 +1036,6 @@ UuidInfo *qmp_backup(
 
     block_on_coroutine_fn(pvebackup_co_prepare, &task);
 
-    if (*errp == NULL) {
-        bool errors = create_backup_jobs();
-
-        if (!errors) {
-            // start the first job in the transaction
-            job_txn_start_seq(backup_state.txn);
-        }
-    }
-
     return task.result;
 }
 