From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Stefan Reiter Date: Mon, 28 Sep 2020 13:40:51 +0200 Subject: [PATCH] PVE-Backup: Don't block on finishing and cleanup create_backup_jobs proxmox_backup_co_finish is already async, but previously we would wait for the coroutine using block_on_coroutine_fn(). Avoid this by scheduling pvebackup_co_complete_stream (and thus pvebackup_co_cleanup) as a real coroutine when calling from pvebackup_complete_cb. This is ok, since complete_stream uses the backup_mutex internally to synchronize, and other streams can happily continue writing in the meantime anyway. To accomodate, backup_mutex is converted to a CoMutex. This means converting every user to a coroutine. This is not just useful here, but will come in handy once this series[0] is merged, and QMP calls can be yield-able coroutines too. Then we can also finally get rid of block_on_coroutine_fn. Cases of aio_context_acquire/release from within what is now a coroutine are changed to aio_co_reschedule_self, which works since a running coroutine always holds the aio lock for the context it is running in. job_cancel_sync is called from a BH since it can't be run from a coroutine (uses AIO_WAIT_WHILE internally). Same thing for create_backup_jobs, which is converted to a BH too. To communicate the finishing state, a new property is introduced to query-backup: 'finishing'. A new state is explicitly not used, since that would break compatibility with older qemu-server versions. Also fix create_backup_jobs: No more weird bool returns, just the standard "errp" format used everywhere else too. With this, if backup_job_create fails, the error message is actually returned over QMP and can be shown to the user. To facilitate correct cleanup on such an error, we call create_backup_jobs as a bottom half directly from pvebackup_co_prepare. This additionally allows us to actually hold the backup_mutex during operation. Also add a job_cancel_sync before job_unref, since a job must be in STATUS_NULL to be deleted by unref, which could trigger an assert before. [0] https://lists.gnu.org/archive/html/qemu-devel/2020-09/msg03515.html Signed-off-by: Stefan Reiter Signed-off-by: Thomas Lamprecht [add new force parameter to job_cancel_sync calls] Signed-off-by: Fabian Ebner --- pve-backup.c | 217 ++++++++++++++++++++++++++++--------------- qapi/block-core.json | 5 +- 2 files changed, 144 insertions(+), 78 deletions(-) diff --git a/pve-backup.c b/pve-backup.c index 63c686463f..6f05796fad 100644 --- a/pve-backup.c +++ b/pve-backup.c @@ -33,7 +33,9 @@ const char *PBS_BITMAP_NAME = "pbs-incremental-dirty-bitmap"; static struct PVEBackupState { struct { - // Everithing accessed from qmp_backup_query command is protected using lock + // Everything accessed from qmp_backup_query command is protected using + // this lock. Do NOT hold this lock for long times, as it is sometimes + // acquired from coroutines, and thus any wait time may block the guest. QemuMutex lock; Error *error; time_t start_time; @@ -47,20 +49,22 @@ static struct PVEBackupState { size_t reused; size_t zero_bytes; GList *bitmap_list; + bool finishing; + bool starting; } stat; int64_t speed; VmaWriter *vmaw; ProxmoxBackupHandle *pbs; GList *di_list; JobTxn *txn; - QemuMutex backup_mutex; + CoMutex backup_mutex; CoMutex dump_callback_mutex; } backup_state; static void pvebackup_init(void) { qemu_mutex_init(&backup_state.stat.lock); - qemu_mutex_init(&backup_state.backup_mutex); + qemu_co_mutex_init(&backup_state.backup_mutex); qemu_co_mutex_init(&backup_state.dump_callback_mutex); } @@ -72,6 +76,7 @@ typedef struct PVEBackupDevInfo { size_t size; uint64_t block_size; uint8_t dev_id; + int completed_ret; // INT_MAX if not completed char targetfile[PATH_MAX]; BdrvDirtyBitmap *bitmap; BlockDriverState *target; @@ -227,12 +232,12 @@ pvebackup_co_dump_vma_cb( } // assumes the caller holds backup_mutex -static void coroutine_fn pvebackup_co_cleanup(void *unused) +static void coroutine_fn pvebackup_co_cleanup(void) { assert(qemu_in_coroutine()); qemu_mutex_lock(&backup_state.stat.lock); - backup_state.stat.end_time = time(NULL); + backup_state.stat.finishing = true; qemu_mutex_unlock(&backup_state.stat.lock); if (backup_state.vmaw) { @@ -261,35 +266,29 @@ static void coroutine_fn pvebackup_co_cleanup(void *unused) g_list_free(backup_state.di_list); backup_state.di_list = NULL; + + qemu_mutex_lock(&backup_state.stat.lock); + backup_state.stat.end_time = time(NULL); + backup_state.stat.finishing = false; + qemu_mutex_unlock(&backup_state.stat.lock); } -// assumes the caller holds backup_mutex -static void coroutine_fn pvebackup_complete_stream(void *opaque) +static void coroutine_fn pvebackup_co_complete_stream(void *opaque) { PVEBackupDevInfo *di = opaque; + int ret = di->completed_ret; - bool error_or_canceled = pvebackup_error_or_canceled(); - - if (backup_state.vmaw) { - vma_writer_close_stream(backup_state.vmaw, di->dev_id); + qemu_mutex_lock(&backup_state.stat.lock); + bool starting = backup_state.stat.starting; + qemu_mutex_unlock(&backup_state.stat.lock); + if (starting) { + /* in 'starting' state, no tasks have been run yet, meaning we can (and + * must) skip all cleanup, as we don't know what has and hasn't been + * initialized yet. */ + return; } - if (backup_state.pbs && !error_or_canceled) { - Error *local_err = NULL; - proxmox_backup_co_close_image(backup_state.pbs, di->dev_id, &local_err); - if (local_err != NULL) { - pvebackup_propagate_error(local_err); - } - } -} - -static void pvebackup_complete_cb(void *opaque, int ret) -{ - assert(!qemu_in_coroutine()); - - PVEBackupDevInfo *di = opaque; - - qemu_mutex_lock(&backup_state.backup_mutex); + qemu_co_mutex_lock(&backup_state.backup_mutex); if (ret < 0) { Error *local_err = NULL; @@ -301,7 +300,19 @@ static void pvebackup_complete_cb(void *opaque, int ret) assert(di->target == NULL); - block_on_coroutine_fn(pvebackup_complete_stream, di); + bool error_or_canceled = pvebackup_error_or_canceled(); + + if (backup_state.vmaw) { + vma_writer_close_stream(backup_state.vmaw, di->dev_id); + } + + if (backup_state.pbs && !error_or_canceled) { + Error *local_err = NULL; + proxmox_backup_co_close_image(backup_state.pbs, di->dev_id, &local_err); + if (local_err != NULL) { + pvebackup_propagate_error(local_err); + } + } // remove self from job list backup_state.di_list = g_list_remove(backup_state.di_list, di); @@ -310,21 +321,49 @@ static void pvebackup_complete_cb(void *opaque, int ret) /* call cleanup if we're the last job */ if (!g_list_first(backup_state.di_list)) { - block_on_coroutine_fn(pvebackup_co_cleanup, NULL); + pvebackup_co_cleanup(); } - qemu_mutex_unlock(&backup_state.backup_mutex); + qemu_co_mutex_unlock(&backup_state.backup_mutex); } -static void pvebackup_cancel(void) +static void pvebackup_complete_cb(void *opaque, int ret) { - assert(!qemu_in_coroutine()); + PVEBackupDevInfo *di = opaque; + di->completed_ret = ret; + + /* + * Schedule stream cleanup in async coroutine. close_image and finish might + * take a while, so we can't block on them here. This way it also doesn't + * matter if we're already running in a coroutine or not. + * Note: di is a pointer to an entry in the global backup_state struct, so + * it stays valid. + */ + Coroutine *co = qemu_coroutine_create(pvebackup_co_complete_stream, di); + aio_co_enter(qemu_get_aio_context(), co); +} + +/* + * job_cancel(_sync) does not like to be called from coroutines, so defer to + * main loop processing via a bottom half. + */ +static void job_cancel_bh(void *opaque) { + CoCtxData *data = (CoCtxData*)opaque; + Job *job = (Job*)data->data; + AioContext *job_ctx = job->aio_context; + aio_context_acquire(job_ctx); + job_cancel_sync(job, true); + aio_context_release(job_ctx); + aio_co_enter(data->ctx, data->co); +} +static void coroutine_fn pvebackup_co_cancel(void *opaque) +{ Error *cancel_err = NULL; error_setg(&cancel_err, "backup canceled"); pvebackup_propagate_error(cancel_err); - qemu_mutex_lock(&backup_state.backup_mutex); + qemu_co_mutex_lock(&backup_state.backup_mutex); if (backup_state.vmaw) { /* make sure vma writer does not block anymore */ @@ -342,27 +381,22 @@ static void pvebackup_cancel(void) ((PVEBackupDevInfo *)bdi->data)->job : NULL; - /* ref the job before releasing the mutex, just to be safe */ if (cancel_job) { - job_ref(&cancel_job->job); + CoCtxData data = { + .ctx = qemu_get_current_aio_context(), + .co = qemu_coroutine_self(), + .data = &cancel_job->job, + }; + aio_bh_schedule_oneshot(data.ctx, job_cancel_bh, &data); + qemu_coroutine_yield(); } - /* job_cancel_sync may enter the job, so we need to release the - * backup_mutex to avoid deadlock */ - qemu_mutex_unlock(&backup_state.backup_mutex); - - if (cancel_job) { - AioContext *aio_context = cancel_job->job.aio_context; - aio_context_acquire(aio_context); - job_cancel_sync(&cancel_job->job, true); - job_unref(&cancel_job->job); - aio_context_release(aio_context); - } + qemu_co_mutex_unlock(&backup_state.backup_mutex); } void qmp_backup_cancel(Error **errp) { - pvebackup_cancel(); + block_on_coroutine_fn(pvebackup_co_cancel, NULL); } // assumes the caller holds backup_mutex @@ -415,10 +449,18 @@ static int coroutine_fn pvebackup_co_add_config( goto out; } -static bool create_backup_jobs(void) { +/* + * backup_job_create can *not* be run from a coroutine (and requires an + * acquired AioContext), so this can't either. + * The caller is responsible that backup_mutex is held nonetheless. + */ +static void create_backup_jobs_bh(void *opaque) { assert(!qemu_in_coroutine()); + CoCtxData *data = (CoCtxData*)opaque; + Error **errp = (Error**)data->data; + Error *local_err = NULL; /* create job transaction to synchronize bitmap commit and cancel all @@ -454,24 +496,19 @@ static bool create_backup_jobs(void) { aio_context_release(aio_context); - if (!job || local_err != NULL) { - Error *create_job_err = NULL; - error_setg(&create_job_err, "backup_job_create failed: %s", - local_err ? error_get_pretty(local_err) : "null"); + di->job = job; - pvebackup_propagate_error(create_job_err); + if (!job || local_err) { + error_setg(errp, "backup_job_create failed: %s", + local_err ? error_get_pretty(local_err) : "null"); break; } - di->job = job; - bdrv_unref(di->target); di->target = NULL; } - bool errors = pvebackup_error_or_canceled(); - - if (errors) { + if (*errp) { l = backup_state.di_list; while (l) { PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data; @@ -483,12 +520,17 @@ static bool create_backup_jobs(void) { } if (di->job) { + AioContext *ctx = di->job->job.aio_context; + aio_context_acquire(ctx); + job_cancel_sync(&di->job->job, true); job_unref(&di->job->job); + aio_context_release(ctx); } } } - return errors; + /* return */ + aio_co_enter(data->ctx, data->co); } typedef struct QmpBackupTask { @@ -525,11 +567,12 @@ typedef struct QmpBackupTask { UuidInfo *result; } QmpBackupTask; -// assumes the caller holds backup_mutex static void coroutine_fn pvebackup_co_prepare(void *opaque) { assert(qemu_in_coroutine()); + qemu_co_mutex_lock(&backup_state.backup_mutex); + QmpBackupTask *task = opaque; task->result = NULL; // just to be sure @@ -550,8 +593,9 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque) const char *firewall_name = "qemu-server.fw"; if (backup_state.di_list) { - error_set(task->errp, ERROR_CLASS_GENERIC_ERROR, + error_set(task->errp, ERROR_CLASS_GENERIC_ERROR, "previous backup not finished"); + qemu_co_mutex_unlock(&backup_state.backup_mutex); return; } @@ -618,6 +662,8 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque) } di->size = size; total += size; + + di->completed_ret = INT_MAX; } uuid_generate(uuid); @@ -849,6 +895,8 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque) backup_state.stat.dirty = total - backup_state.stat.reused; backup_state.stat.transferred = 0; backup_state.stat.zero_bytes = 0; + backup_state.stat.finishing = false; + backup_state.stat.starting = true; qemu_mutex_unlock(&backup_state.stat.lock); @@ -863,6 +911,33 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque) uuid_info->UUID = uuid_str; task->result = uuid_info; + + /* Run create_backup_jobs_bh outside of coroutine (in BH) but keep + * backup_mutex locked. This is fine, a CoMutex can be held across yield + * points, and we'll release it as soon as the BH reschedules us. + */ + CoCtxData waker = { + .co = qemu_coroutine_self(), + .ctx = qemu_get_current_aio_context(), + .data = &local_err, + }; + aio_bh_schedule_oneshot(waker.ctx, create_backup_jobs_bh, &waker); + qemu_coroutine_yield(); + + if (local_err) { + error_propagate(task->errp, local_err); + goto err; + } + + qemu_co_mutex_unlock(&backup_state.backup_mutex); + + qemu_mutex_lock(&backup_state.stat.lock); + backup_state.stat.starting = false; + qemu_mutex_unlock(&backup_state.stat.lock); + + /* start the first job in the transaction */ + job_txn_start_seq(backup_state.txn); + return; err_mutex: @@ -885,6 +960,7 @@ err: g_free(di); } g_list_free(di_list); + backup_state.di_list = NULL; if (devs) { g_strfreev(devs); @@ -905,6 +981,8 @@ err: } task->result = NULL; + + qemu_co_mutex_unlock(&backup_state.backup_mutex); return; } @@ -958,24 +1036,8 @@ UuidInfo *qmp_backup( .errp = errp, }; - qemu_mutex_lock(&backup_state.backup_mutex); - block_on_coroutine_fn(pvebackup_co_prepare, &task); - if (*errp == NULL) { - bool errors = create_backup_jobs(); - qemu_mutex_unlock(&backup_state.backup_mutex); - - if (!errors) { - /* start the first job in the transaction - * note: this might directly enter the job, so we need to do this - * after unlocking the backup_mutex */ - job_txn_start_seq(backup_state.txn); - } - } else { - qemu_mutex_unlock(&backup_state.backup_mutex); - } - return task.result; } @@ -1027,6 +1089,7 @@ BackupStatus *qmp_query_backup(Error **errp) info->transferred = backup_state.stat.transferred; info->has_reused = true; info->reused = backup_state.stat.reused; + info->finishing = backup_state.stat.finishing; qemu_mutex_unlock(&backup_state.stat.lock); diff --git a/qapi/block-core.json b/qapi/block-core.json index 69571d86eb..e6c3687bea 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -774,12 +774,15 @@ # # @uuid: uuid for this backup job # +# @finishing: if status='active' and finishing=true, then the backup process is +# waiting for the target to finish. +# ## { 'struct': 'BackupStatus', 'data': {'*status': 'str', '*errmsg': 'str', '*total': 'int', '*dirty': 'int', '*transferred': 'int', '*zero-bytes': 'int', '*reused': 'int', '*start-time': 'int', '*end-time': 'int', - '*backup-file': 'str', '*uuid': 'str' } } + '*backup-file': 'str', '*uuid': 'str', 'finishing': 'bool' } } ## # @BackupFormat: