pbs cleanup fixes
Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
This commit is contained in:
		
							parent
							
								
									53bff441c5
								
							
						
					
					
						commit
						7f4326d1dc
					
				
							
								
								
									
										59
									
								
								debian/patches/pve/0054-PVE-Backup-create-jobs-correctly-cancel-in-error-sce.patch
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										59
									
								
								debian/patches/pve/0054-PVE-Backup-create-jobs-correctly-cancel-in-error-sce.patch
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @ -0,0 +1,59 @@ | ||||
| From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 | ||||
| From: Fabian Ebner <f.ebner@proxmox.com> | ||||
| Date: Wed, 25 May 2022 13:59:37 +0200 | ||||
| Subject: [PATCH] PVE-Backup: create jobs: correctly cancel in error scenario | ||||
| 
 | ||||
| The first call to job_cancel_sync() will cancel and free all jobs in | ||||
| the transaction, so ensure that it's called only once and get rid of | ||||
| the job_unref() that would operate on freed memory. | ||||
| 
 | ||||
| It's also necessary to NULL backup_state.pbs in the error scenario, | ||||
| because a subsequent backup_cancel QMP call (as happens in PVE when | ||||
| the backup QMP command fails) would try to call proxmox_backup_abort() | ||||
| and run into a segfault. | ||||
| 
 | ||||
| Signed-off-by: Fabian Ebner <f.ebner@proxmox.com> | ||||
| Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com> | ||||
| ---
 | ||||
|  pve-backup.c | 10 ++++++++-- | ||||
|  1 file changed, 8 insertions(+), 2 deletions(-) | ||||
| 
 | ||||
| diff --git a/pve-backup.c b/pve-backup.c
 | ||||
| index f6a5f8c785..5bed6f4014 100644
 | ||||
| --- a/pve-backup.c
 | ||||
| +++ b/pve-backup.c
 | ||||
| @@ -506,6 +506,11 @@ static void create_backup_jobs_bh(void *opaque) {
 | ||||
|      } | ||||
|   | ||||
|      if (*errp) { | ||||
| +        /*
 | ||||
| +         * It's enough to cancel one job in the transaction, the rest will
 | ||||
| +         * follow automatically.
 | ||||
| +         */
 | ||||
| +        bool canceled = false;
 | ||||
|          l = backup_state.di_list; | ||||
|          while (l) { | ||||
|              PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data; | ||||
| @@ -516,12 +521,12 @@ static void create_backup_jobs_bh(void *opaque) {
 | ||||
|                  di->target = NULL; | ||||
|              } | ||||
|   | ||||
| -            if (di->job) {
 | ||||
| +            if (!canceled && di->job) {
 | ||||
|                  AioContext *ctx = di->job->job.aio_context; | ||||
|                  aio_context_acquire(ctx); | ||||
|                  job_cancel_sync(&di->job->job, true); | ||||
| -                job_unref(&di->job->job);
 | ||||
|                  aio_context_release(ctx); | ||||
| +                canceled = true;
 | ||||
|              } | ||||
|          } | ||||
|      } | ||||
| @@ -947,6 +952,7 @@ err:
 | ||||
|   | ||||
|      if (pbs) { | ||||
|          proxmox_backup_disconnect(pbs); | ||||
| +        backup_state.pbs = NULL;
 | ||||
|      } | ||||
|   | ||||
|      if (backup_dir) { | ||||
							
								
								
									
										60
									
								
								debian/patches/pve/0055-PVE-Backup-ensure-jobs-in-di_list-are-referenced.patch
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										60
									
								
								debian/patches/pve/0055-PVE-Backup-ensure-jobs-in-di_list-are-referenced.patch
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @ -0,0 +1,60 @@ | ||||
| From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 | ||||
| From: Fabian Ebner <f.ebner@proxmox.com> | ||||
| Date: Wed, 25 May 2022 13:59:38 +0200 | ||||
| Subject: [PATCH] PVE-Backup: ensure jobs in di_list are referenced | ||||
| 
 | ||||
| Ensures that qmp_backup_cancel doesn't pick a job that's already been | ||||
| freed. With unlucky timings it seems possible that: | ||||
| 1. job_exit -> job_completed -> job_finalize_single starts | ||||
| 2. pvebackup_co_complete_stream gets spawned in completion callback | ||||
| 3. job finalize_single finishes -> job's refcount hits zero -> job is | ||||
|    freed | ||||
| 4. qmp_backup_cancel comes in and locks backup_state.backup_mutex | ||||
|    before pvebackup_co_complete_stream can remove the job from the | ||||
|    di_list | ||||
| 5. qmp_backup_cancel will pick a job that's already been freed | ||||
| 
 | ||||
| Signed-off-by: Fabian Ebner <f.ebner@proxmox.com> | ||||
| Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com> | ||||
| ---
 | ||||
|  pve-backup.c | 13 +++++++++++++ | ||||
|  1 file changed, 13 insertions(+) | ||||
| 
 | ||||
| diff --git a/pve-backup.c b/pve-backup.c
 | ||||
| index 5bed6f4014..cd45e66a61 100644
 | ||||
| --- a/pve-backup.c
 | ||||
| +++ b/pve-backup.c
 | ||||
| @@ -316,6 +316,11 @@ static void coroutine_fn pvebackup_co_complete_stream(void *opaque)
 | ||||
|          } | ||||
|      } | ||||
|   | ||||
| +    if (di->job) {
 | ||||
| +        job_unref(&di->job->job);
 | ||||
| +        di->job = NULL;
 | ||||
| +    }
 | ||||
| +
 | ||||
|      // remove self from job list | ||||
|      backup_state.di_list = g_list_remove(backup_state.di_list, di); | ||||
|   | ||||
| @@ -494,6 +499,9 @@ static void create_backup_jobs_bh(void *opaque) {
 | ||||
|          aio_context_release(aio_context); | ||||
|   | ||||
|          di->job = job; | ||||
| +        if (job) {
 | ||||
| +            job_ref(&job->job);
 | ||||
| +        }
 | ||||
|   | ||||
|          if (!job || local_err) { | ||||
|              error_setg(errp, "backup_job_create failed: %s", | ||||
| @@ -528,6 +536,11 @@ static void create_backup_jobs_bh(void *opaque) {
 | ||||
|                  aio_context_release(ctx); | ||||
|                  canceled = true; | ||||
|              } | ||||
| +
 | ||||
| +            if (di->job) {
 | ||||
| +                job_unref(&di->job->job);
 | ||||
| +                di->job = NULL;
 | ||||
| +            }
 | ||||
|          } | ||||
|      } | ||||
|   | ||||
							
								
								
									
										120
									
								
								debian/patches/pve/0056-PVE-Backup-avoid-segfault-issues-upon-backup-cancel.patch
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										120
									
								
								debian/patches/pve/0056-PVE-Backup-avoid-segfault-issues-upon-backup-cancel.patch
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @ -0,0 +1,120 @@ | ||||
| From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 | ||||
| From: Fabian Ebner <f.ebner@proxmox.com> | ||||
| Date: Wed, 25 May 2022 13:59:39 +0200 | ||||
| Subject: [PATCH] PVE-Backup: avoid segfault issues upon backup-cancel | ||||
| 
 | ||||
| When canceling a backup in PVE via a signal it's easy to run into a | ||||
| situation where the job is already failing when the backup_cancel QMP | ||||
| command comes in. With a bit of unlucky timing on top, it can happen | ||||
| that job_exit() runs between schedulung of job_cancel_bh() and | ||||
| execution of job_cancel_bh(). But job_cancel_sync() does not expect | ||||
| that the job is already finalized (in fact, the job might've been | ||||
| freed already, but even if it isn't, job_cancel_sync() would try to | ||||
| deref job->txn which would be NULL at that point). | ||||
| 
 | ||||
| It is not possible to simply use the job_cancel() (which is advertised | ||||
| as being async but isn't in all cases) in qmp_backup_cancel() for the | ||||
| same reason job_cancel_sync() cannot be used. Namely, because it can | ||||
| invoke job_finish_sync() (which uses AIO_WAIT_WHILE and thus hangs if | ||||
| called from a coroutine). This happens when there's multiple jobs in | ||||
| the transaction and job->deferred_to_main_loop is true (is set before | ||||
| scheduling job_exit()) or if the job was not started yet. | ||||
| 
 | ||||
| Fix the issue by selecting the job to cancel in job_cancel_bh() itself | ||||
| using the first job that's not completed yet. This is not necessarily | ||||
| the first job in the list, because pvebackup_co_complete_stream() | ||||
| might not yet have removed a completed job when job_cancel_bh() runs. | ||||
| 
 | ||||
| An alternative would be to continue using only the first job and | ||||
| checking against JOB_STATUS_CONCLUDED or JOB_STATUS_NULL to decide if | ||||
| it's still necessary and possible to cancel, but the approach with | ||||
| using the first non-completed job seemed more robust. | ||||
| 
 | ||||
| Signed-off-by: Fabian Ebner <f.ebner@proxmox.com> | ||||
| Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com> | ||||
| ---
 | ||||
|  pve-backup.c | 61 +++++++++++++++++++++++++++++++++------------------- | ||||
|  1 file changed, 39 insertions(+), 22 deletions(-) | ||||
| 
 | ||||
| diff --git a/pve-backup.c b/pve-backup.c
 | ||||
| index cd45e66a61..be21027dad 100644
 | ||||
| --- a/pve-backup.c
 | ||||
| +++ b/pve-backup.c
 | ||||
| @@ -352,15 +352,42 @@ static void pvebackup_complete_cb(void *opaque, int ret)
 | ||||
|   | ||||
|  /* | ||||
|   * job_cancel(_sync) does not like to be called from coroutines, so defer to | ||||
| - * main loop processing via a bottom half.
 | ||||
| + * main loop processing via a bottom half. Assumes that caller holds
 | ||||
| + * backup_mutex.
 | ||||
|   */ | ||||
|  static void job_cancel_bh(void *opaque) { | ||||
|      CoCtxData *data = (CoCtxData*)opaque; | ||||
| -    Job *job = (Job*)data->data;
 | ||||
| -    AioContext *job_ctx = job->aio_context;
 | ||||
| -    aio_context_acquire(job_ctx);
 | ||||
| -    job_cancel_sync(job, true);
 | ||||
| -    aio_context_release(job_ctx);
 | ||||
| +
 | ||||
| +    /*
 | ||||
| +     * Be careful to pick a valid job to cancel:
 | ||||
| +     * 1. job_cancel_sync() does not expect the job to be finalized already.
 | ||||
| +     * 2. job_exit() might run between scheduling and running job_cancel_bh()
 | ||||
| +     *    and pvebackup_co_complete_stream() might not have removed the job from
 | ||||
| +     *    the list yet (in fact, cannot, because it waits for the backup_mutex).
 | ||||
| +     * Requiring !job_is_completed() ensures that no finalized job is picked.
 | ||||
| +     */
 | ||||
| +    GList *bdi = g_list_first(backup_state.di_list);
 | ||||
| +    while (bdi) {
 | ||||
| +        if (bdi->data) {
 | ||||
| +            BlockJob *bj = ((PVEBackupDevInfo *)bdi->data)->job;
 | ||||
| +            if (bj) {
 | ||||
| +                Job *job = &bj->job;
 | ||||
| +                if (!job_is_completed(job)) {
 | ||||
| +                    AioContext *job_ctx = job->aio_context;
 | ||||
| +                    aio_context_acquire(job_ctx);
 | ||||
| +                    job_cancel_sync(job, true);
 | ||||
| +                    aio_context_release(job_ctx);
 | ||||
| +                    /*
 | ||||
| +                     * It's enough to cancel one job in the transaction, the
 | ||||
| +                     * rest will follow automatically.
 | ||||
| +                     */
 | ||||
| +                    break;
 | ||||
| +                }
 | ||||
| +            }
 | ||||
| +        }
 | ||||
| +        bdi = g_list_next(bdi);
 | ||||
| +    }
 | ||||
| +
 | ||||
|      aio_co_enter(data->ctx, data->co); | ||||
|  } | ||||
|   | ||||
| @@ -381,22 +408,12 @@ void coroutine_fn qmp_backup_cancel(Error **errp)
 | ||||
|          proxmox_backup_abort(backup_state.pbs, "backup canceled"); | ||||
|      } | ||||
|   | ||||
| -    /* it's enough to cancel one job in the transaction, the rest will follow
 | ||||
| -     * automatically */
 | ||||
| -    GList *bdi = g_list_first(backup_state.di_list);
 | ||||
| -    BlockJob *cancel_job = bdi && bdi->data ?
 | ||||
| -        ((PVEBackupDevInfo *)bdi->data)->job :
 | ||||
| -        NULL;
 | ||||
| -
 | ||||
| -    if (cancel_job) {
 | ||||
| -        CoCtxData data = {
 | ||||
| -            .ctx = qemu_get_current_aio_context(),
 | ||||
| -            .co = qemu_coroutine_self(),
 | ||||
| -            .data = &cancel_job->job,
 | ||||
| -        };
 | ||||
| -        aio_bh_schedule_oneshot(data.ctx, job_cancel_bh, &data);
 | ||||
| -        qemu_coroutine_yield();
 | ||||
| -    }
 | ||||
| +    CoCtxData data = {
 | ||||
| +        .ctx = qemu_get_current_aio_context(),
 | ||||
| +        .co = qemu_coroutine_self(),
 | ||||
| +    };
 | ||||
| +    aio_bh_schedule_oneshot(data.ctx, job_cancel_bh, &data);
 | ||||
| +    qemu_coroutine_yield();
 | ||||
|   | ||||
|      qemu_co_mutex_unlock(&backup_state.backup_mutex); | ||||
|  } | ||||
							
								
								
									
										3
									
								
								debian/patches/series
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								debian/patches/series
									
									
									
									
										vendored
									
									
								
							| @ -76,3 +76,6 @@ pve/0050-qemu-img-dd-add-l-option-for-loading-a-snapshot.patch | ||||
| pve/0051-vma-allow-partial-restore.patch | ||||
| pve/0052-pbs-namespace-support.patch | ||||
| pve/0053-Revert-block-rbd-implement-bdrv_co_block_status.patch | ||||
| pve/0054-PVE-Backup-create-jobs-correctly-cancel-in-error-sce.patch | ||||
| pve/0055-PVE-Backup-ensure-jobs-in-di_list-are-referenced.patch | ||||
| pve/0056-PVE-Backup-avoid-segfault-issues-upon-backup-cancel.patch | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Wolfgang Bumiller
						Wolfgang Bumiller