From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Stefan Reiter Date: Thu, 22 Oct 2020 17:01:07 +0200 Subject: [PATCH] PVE: fix and clean up error handling for create_backup_jobs No more weird bool returns, just the standard "errp" format used everywhere else too. With this, if backup_job_create fails, the error message is actually returned over QMP and can be shown to the user. To facilitate correct cleanup on such an error, we call create_backup_jobs as a bottom half directly from pvebackup_co_prepare. This additionally allows us to actually hold the backup_mutex during operation. Also add a job_cancel_sync before job_unref, since a job must be in STATUS_NULL to be deleted by unref, which could trigger an assert before. Signed-off-by: Stefan Reiter --- pve-backup.c | 79 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/pve-backup.c b/pve-backup.c index 4402c0cb17..c7cde0fb0e 100644 --- a/pve-backup.c +++ b/pve-backup.c @@ -50,6 +50,7 @@ static struct PVEBackupState { size_t zero_bytes; GList *bitmap_list; bool finishing; + bool starting; } stat; int64_t speed; VmaWriter *vmaw; @@ -277,6 +278,16 @@ static void coroutine_fn pvebackup_co_complete_stream(void *opaque) PVEBackupDevInfo *di = opaque; int ret = di->completed_ret; + qemu_mutex_lock(&backup_state.stat.lock); + bool starting = backup_state.stat.starting; + qemu_mutex_unlock(&backup_state.stat.lock); + if (starting) { + /* in 'starting' state, no tasks have been run yet, meaning we can (and + * must) skip all cleanup, as we don't know what has and hasn't been + * initialized yet. */ + return; + } + qemu_co_mutex_lock(&backup_state.backup_mutex); if (ret < 0) { @@ -441,15 +452,15 @@ static int coroutine_fn pvebackup_co_add_config( /* * backup_job_create can *not* be run from a coroutine (and requires an * acquired AioContext), so this can't either. - * This does imply that this function cannot run with backup_mutex acquired. - * That is ok because it is only ever called between setting up the backup_state - * struct and starting the jobs, and from within a QMP call. This means that no - * other QMP call can interrupt, and no background job is running yet. + * The caller is responsible that backup_mutex is held nonetheless. */ -static bool create_backup_jobs(void) { +static void create_backup_jobs_bh(void *opaque) { assert(!qemu_in_coroutine()); + CoCtxData *data = (CoCtxData*)opaque; + Error **errp = (Error**)data->data; + Error *local_err = NULL; /* create job transaction to synchronize bitmap commit and cancel all @@ -483,24 +494,19 @@ static bool create_backup_jobs(void) { aio_context_release(aio_context); - if (!job || local_err != NULL) { - Error *create_job_err = NULL; - error_setg(&create_job_err, "backup_job_create failed: %s", - local_err ? error_get_pretty(local_err) : "null"); + di->job = job; - pvebackup_propagate_error(create_job_err); + if (!job || local_err) { + error_setg(errp, "backup_job_create failed: %s", + local_err ? error_get_pretty(local_err) : "null"); break; } - di->job = job; - bdrv_unref(di->target); di->target = NULL; } - bool errors = pvebackup_error_or_canceled(); - - if (errors) { + if (*errp) { l = backup_state.di_list; while (l) { PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data; @@ -512,12 +518,17 @@ static bool create_backup_jobs(void) { } if (di->job) { + AioContext *ctx = di->job->job.aio_context; + aio_context_acquire(ctx); + job_cancel_sync(&di->job->job); job_unref(&di->job->job); + aio_context_release(ctx); } } } - return errors; + /* return */ + aio_co_enter(data->ctx, data->co); } typedef struct QmpBackupTask { @@ -883,6 +894,7 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque) backup_state.stat.transferred = 0; backup_state.stat.zero_bytes = 0; backup_state.stat.finishing = false; + backup_state.stat.starting = true; qemu_mutex_unlock(&backup_state.stat.lock); @@ -898,7 +910,32 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque) task->result = uuid_info; + /* Run create_backup_jobs_bh outside of coroutine (in BH) but keep + * backup_mutex locked. This is fine, a CoMutex can be held across yield + * points, and we'll release it as soon as the BH reschedules us. + */ + CoCtxData waker = { + .co = qemu_coroutine_self(), + .ctx = qemu_get_current_aio_context(), + .data = &local_err, + }; + aio_bh_schedule_oneshot(waker.ctx, create_backup_jobs_bh, &waker); + qemu_coroutine_yield(); + + if (local_err) { + error_propagate(task->errp, local_err); + goto err; + } + qemu_co_mutex_unlock(&backup_state.backup_mutex); + + qemu_mutex_lock(&backup_state.stat.lock); + backup_state.stat.starting = false; + qemu_mutex_unlock(&backup_state.stat.lock); + + /* start the first job in the transaction */ + job_txn_start_seq(backup_state.txn); + return; err_mutex: @@ -921,6 +958,7 @@ err: g_free(di); } g_list_free(di_list); + backup_state.di_list = NULL; if (devs) { g_strfreev(devs); @@ -998,15 +1036,6 @@ UuidInfo *qmp_backup( block_on_coroutine_fn(pvebackup_co_prepare, &task); - if (*errp == NULL) { - bool errors = create_backup_jobs(); - - if (!errors) { - // start the first job in the transaction - job_txn_start_seq(backup_state.txn); - } - } - return task.result; }