From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Dietmar Maurer
Date: Mon, 6 Apr 2020 12:16:46 +0200
Subject: [PATCH] PVE: internal snapshot async

Truncate at 1024 boundary (Fabian Ebner will send a patch for stable)

Put qemu_savevm_state_{header,setup} into the main loop and the rest of
the iteration into a coroutine. The former need to lock the iothread
(and we can't unlock it in the coroutine), and the latter can't deal
with being in a separate thread, so a coroutine it must be.

Signed-off-by: Thomas Lamprecht
Signed-off-by: Dietmar Maurer
Signed-off-by: Wolfgang Bumiller
---
 Makefile.objs                |   1 +
 hmp-commands-info.hx         |  13 +
 hmp-commands.hx              |  32 +++
 include/block/aio.h          |  10 +
 include/migration/snapshot.h |   1 +
 include/monitor/hmp.h        |   5 +
 monitor/hmp-cmds.c           |  57 ++++
 qapi/migration.json          |  34 +++
 qapi/misc.json               |  32 +++
 qemu-options.hx              |  12 +
 savevm-async.c               | 542 +++++++++++++++++++++++++++++++++++
 softmmu/vl.c                 |  10 +
 util/async.c                 |  30 ++
 13 files changed, 779 insertions(+)
 create mode 100644 savevm-async.c

diff --git a/Makefile.objs b/Makefile.objs
index d22b3b45d7..a1307c12a8 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -46,6 +46,7 @@ common-obj-y += bootdevice.o iothread.o
 common-obj-y += dump/
 common-obj-y += job-qmp.o
 common-obj-y += monitor/
+common-obj-y += savevm-async.o
 common-obj-y += net/
 common-obj-y += qdev-monitor.o
 common-obj-$(CONFIG_WIN32) += os-win32.o
diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index 30209e3903..ae8ff21789 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -580,6 +580,19 @@ SRST
     Show current migration xbzrle cache size.
 ERST
 
+    {
+        .name       = "savevm",
+        .args_type  = "",
+        .params     = "",
+        .help       = "show savevm status",
+        .cmd = hmp_info_savevm,
+    },
+
+SRST
+  ``info savevm``
+    Show savevm status.
+ERST
+
     {
         .name       = "balloon",
        .args_type  = "",
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 60f395c276..2b58ac4a1c 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1829,3 +1829,35 @@ ERST
         .flags      = "p",
     },
 
+
+    {
+        .name       = "savevm-start",
+        .args_type  = "statefile:s?",
+        .params     = "[statefile]",
+        .help       = "Prepare for snapshot and halt VM. Save VM state to statefile.",
+        .cmd = hmp_savevm_start,
+    },
+
+    {
+        .name       = "snapshot-drive",
+        .args_type  = "device:s,name:s",
+        .params     = "device name",
+        .help       = "Create internal snapshot.",
+        .cmd = hmp_snapshot_drive,
+    },
+
+    {
+        .name       = "delete-drive-snapshot",
+        .args_type  = "device:s,name:s",
+        .params     = "device name",
+        .help       = "Delete internal snapshot.",
+        .cmd = hmp_delete_drive_snapshot,
+    },
+
+    {
+        .name       = "savevm-end",
+        .args_type  = "",
+        .params     = "",
+        .help       = "Resume VM after snapshot.",
+        .cmd = hmp_savevm_end,
+    },
diff --git a/include/block/aio.h b/include/block/aio.h
index b2f703fa3f..c37617b404 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -17,6 +17,7 @@
 #ifdef CONFIG_LINUX_IO_URING
 #include <liburing.h>
 #endif
+#include "qemu/coroutine.h"
 #include "qemu/queue.h"
 #include "qemu/event_notifier.h"
 #include "qemu/thread.h"
@@ -654,6 +655,15 @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
  */
 void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
 
+/**
+ * aio_co_reschedule_self:
+ * @new_ctx: the new context
+ *
+ * Move the currently running coroutine to new_ctx. If the coroutine is
+ * already running in new_ctx, do nothing.
+ */
+void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
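A minimal usage sketch for this helper (illustrative only; example_co,
home_ctx and other_ctx are made-up names, not part of this patch): a
coroutine that must touch state owned by another AioContext can hop over
and back like this:

    static void coroutine_fn example_co(void *opaque)
    {
        AioContext *other_ctx = opaque;        /* hypothetical target */
        AioContext *home_ctx = qemu_get_current_aio_context();

        aio_co_reschedule_self(other_ctx);     /* now running in other_ctx */
        /* ... work that must happen in other_ctx ... */
        aio_co_reschedule_self(home_ctx);      /* hop back */
    }

process_savevm_co in the new savevm-async.c uses exactly this pattern to
flush drives inside their own IOThreads.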
+
 /**
  * aio_co_wake:
  * @co: the coroutine
diff --git a/include/migration/snapshot.h b/include/migration/snapshot.h
index c85b6ec75b..4411b7121d 100644
--- a/include/migration/snapshot.h
+++ b/include/migration/snapshot.h
@@ -17,5 +17,6 @@
 
 int save_snapshot(const char *name, Error **errp);
 int load_snapshot(const char *name, Error **errp);
+int load_snapshot_from_blockdev(const char *filename, Error **errp);
 
 #endif
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index c986cfd28b..243952d32f 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -25,6 +25,7 @@ void hmp_info_status(Monitor *mon, const QDict *qdict);
 void hmp_info_uuid(Monitor *mon, const QDict *qdict);
 void hmp_info_chardev(Monitor *mon, const QDict *qdict);
 void hmp_info_mice(Monitor *mon, const QDict *qdict);
+void hmp_info_savevm(Monitor *mon, const QDict *qdict);
 void hmp_info_migrate(Monitor *mon, const QDict *qdict);
 void hmp_info_migrate_capabilities(Monitor *mon, const QDict *qdict);
 void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict);
@@ -83,6 +84,10 @@ void hmp_netdev_add(Monitor *mon, const QDict *qdict);
 void hmp_netdev_del(Monitor *mon, const QDict *qdict);
 void hmp_getfd(Monitor *mon, const QDict *qdict);
 void hmp_closefd(Monitor *mon, const QDict *qdict);
+void hmp_savevm_start(Monitor *mon, const QDict *qdict);
+void hmp_snapshot_drive(Monitor *mon, const QDict *qdict);
+void hmp_delete_drive_snapshot(Monitor *mon, const QDict *qdict);
+void hmp_savevm_end(Monitor *mon, const QDict *qdict);
 void hmp_sendkey(Monitor *mon, const QDict *qdict);
 void hmp_screendump(Monitor *mon, const QDict *qdict);
 void hmp_chardev_add(Monitor *mon, const QDict *qdict);
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 6e26ea2cd0..280bb447a6 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -1904,6 +1904,63 @@
     hmp_handle_error(mon, err);
 }
 
+void hmp_savevm_start(Monitor *mon, const QDict *qdict)
+{
+    Error *errp = NULL;
+    const char *statefile = qdict_get_try_str(qdict, "statefile");
+
+    qmp_savevm_start(statefile != NULL, statefile, &errp);
+    hmp_handle_error(mon, errp);
+}
+
+void hmp_snapshot_drive(Monitor *mon, const QDict *qdict)
+{
+    Error *errp = NULL;
+    const char *name = qdict_get_str(qdict, "name");
+    const char *device = qdict_get_str(qdict, "device");
+
+    qmp_snapshot_drive(device, name, &errp);
+    hmp_handle_error(mon, errp);
+}
+
+void hmp_delete_drive_snapshot(Monitor *mon, const QDict *qdict)
+{
+    Error *errp = NULL;
+    const char *name = qdict_get_str(qdict, "name");
+    const char *device = qdict_get_str(qdict, "device");
+
+    qmp_delete_drive_snapshot(device, name, &errp);
+    hmp_handle_error(mon, errp);
+}
+
+void hmp_savevm_end(Monitor *mon, const QDict *qdict)
+{
+    Error *errp = NULL;
+
+    qmp_savevm_end(&errp);
+    hmp_handle_error(mon, errp);
+}
+
+void hmp_info_savevm(Monitor *mon, const QDict *qdict)
+{
+    SaveVMInfo *info = qmp_query_savevm(NULL);
+
+    if (info->has_status) {
+        monitor_printf(mon, "savevm status: %s\n", info->status);
+        monitor_printf(mon, "total time: %" PRIu64 " milliseconds\n",
+                       info->total_time);
+    } else {
+        monitor_printf(mon, "savevm status: not running\n");
+    }
+    if (info->has_bytes) {
+        monitor_printf(mon, "Bytes saved: %" PRIu64 "\n", info->bytes);
+    }
+    if (info->has_error) {
+        monitor_printf(mon, "Error: %s\n", info->error);
+    }
+    qapi_free_SaveVMInfo(info); /* qmp_query_savevm() result is allocated */
+}
+
 void hmp_info_iothreads(Monitor *mon, const QDict *qdict)
 {
     IOThreadInfoList *info_list = qmp_query_iothreads(NULL);
diff --git a/qapi/migration.json b/qapi/migration.json
index ea53b23dca..c556257544 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -225,6 +225,40 @@
           '*compression': 'CompressionStats',
           '*socket-address': ['SocketAddress'] } }
 
+##
+# @SaveVMInfo:
+#
+# Information about the current savevm process.
+#
+# @status: string describing the current savevm status.
+#          This can be 'active', 'completed', 'failed'.
+#          If this field is not returned, no savevm process
+#          has been initiated.
+#
+# @error: string containing the error message if status is 'failed'.
+#
+# @total-time: total amount of milliseconds since savevm started.
+#              If savevm has ended, this is the total save time.
+#
+# @bytes: total amount of data transferred
+#
+# Since: 1.3
+##
+{ 'struct': 'SaveVMInfo',
+  'data': {'*status': 'str', '*error': 'str',
+           '*total-time': 'int', '*bytes': 'int'} }
+
+##
+# @query-savevm:
+#
+# Returns information about the current savevm process.
+#
+# Returns: @SaveVMInfo
+#
+# Since: 1.3
+##
+{ 'command': 'query-savevm', 'returns': 'SaveVMInfo' }
+
 ##
 # @query-migrate:
 #
diff --git a/qapi/misc.json b/qapi/misc.json
index 44b1fb6fa7..9895899f8b 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
@@ -1168,6 +1168,38 @@
 ##
 { 'command': 'query-fdsets', 'returns': ['FdsetInfo'] }
 
+##
+# @savevm-start:
+#
+# Prepare for snapshot and halt VM. Save VM state to statefile.
+#
+##
+{ 'command': 'savevm-start', 'data': { '*statefile': 'str' } }
+
+##
+# @snapshot-drive:
+#
+# Create an internal drive snapshot.
+#
+##
+{ 'command': 'snapshot-drive', 'data': { 'device': 'str', 'name': 'str' } }
+
+##
+# @delete-drive-snapshot:
+#
+# Delete a drive snapshot.
+#
+##
+{ 'command': 'delete-drive-snapshot', 'data': { 'device': 'str', 'name': 'str' } }
+
+##
+# @savevm-end:
+#
+# Resume VM after a snapshot.
+#
+##
+{ 'command': 'savevm-end' }
+
 ##
 # @AcpiTableOptions:
 #
diff --git a/qemu-options.hx b/qemu-options.hx
index 708583b4ce..d32995cc50 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3866,6 +3866,18 @@ SRST
     Start right away with a saved state (``loadvm`` in monitor)
 ERST
 
+DEF("loadstate", HAS_ARG, QEMU_OPTION_loadstate, \
+    "-loadstate file\n" \
+    "                start right away with a saved state\n",
+    QEMU_ARCH_ALL)
+SRST
+``-loadstate file``
+    Start right away with a saved state. Unlike ``loadvm``, this option
+    does not roll back disk state, so the user must make sure that the
+    disks are in the correct state. *file* can be any valid device URL.
+    See the section on "Device URL Syntax" for more information.
+ERST
+
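As a concrete invocation (hypothetical paths and drive layout, shown for
illustration only), restoring a state previously written by savevm-start
could look like:

    qemu-system-x86_64 \
        -drive file=/dev/vg0/vm-100-disk-0,if=virtio \
        -loadstate /dev/vg0/vm-100-state

Because the option accepts a device URL, a raw LVM volume written by
savevm-start works just as well as a regular file.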
 #ifndef _WIN32
 DEF("daemonize", 0, QEMU_OPTION_daemonize, \
     "-daemonize      daemonize QEMU after initializing\n", QEMU_ARCH_ALL)
diff --git a/savevm-async.c b/savevm-async.c
new file mode 100644
index 0000000000..f918e18dce
--- /dev/null
+++ b/savevm-async.c
@@ -0,0 +1,542 @@
+#include "qemu/osdep.h"
+#include "migration/migration.h"
+#include "migration/savevm.h"
+#include "migration/snapshot.h"
+#include "migration/global_state.h"
+#include "migration/ram.h"
+#include "migration/qemu-file.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/runstate.h"
+#include "block/block.h"
+#include "sysemu/block-backend.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qapi-commands-migration.h"
+#include "qapi/qapi-commands-misc.h"
+#include "qapi/qapi-commands-block.h"
+#include "qemu/cutils.h"
+#include "qemu/main-loop.h"
+#include "qemu/rcu.h"
+
+/* #define DEBUG_SAVEVM_STATE */
+
+/* used while an emulated sync operation is in progress */
+#define NOT_DONE -EINPROGRESS
+
+#ifdef DEBUG_SAVEVM_STATE
+#define DPRINTF(fmt, ...) \
+    do { printf("savevm-async: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+enum {
+    SAVE_STATE_DONE,
+    SAVE_STATE_ERROR,
+    SAVE_STATE_ACTIVE,
+    SAVE_STATE_COMPLETED,
+    SAVE_STATE_CANCELLED
+};
+
+
+static struct SnapshotState {
+    BlockBackend *target;
+    size_t bs_pos;
+    int state;
+    Error *error;
+    Error *blocker;
+    int saved_vm_running;
+    QEMUFile *file;
+    int64_t total_time;
+    QEMUBH *finalize_bh;
+    Coroutine *co;
+} snap_state;
+
+SaveVMInfo *qmp_query_savevm(Error **errp)
+{
+    SaveVMInfo *info = g_malloc0(sizeof(*info));
+    struct SnapshotState *s = &snap_state;
+
+    if (s->state != SAVE_STATE_DONE) {
+        info->has_bytes = true;
+        info->bytes = s->bs_pos;
+        switch (s->state) {
+        case SAVE_STATE_ERROR:
+            info->has_status = true;
+            info->status = g_strdup("failed");
+            info->has_total_time = true;
+            info->total_time = s->total_time;
+            if (s->error) {
+                info->has_error = true;
+                info->error = g_strdup(error_get_pretty(s->error));
+            }
+            break;
+        case SAVE_STATE_ACTIVE:
+            info->has_status = true;
+            info->status = g_strdup("active");
+            info->has_total_time = true;
+            info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
+                - s->total_time;
+            break;
+        case SAVE_STATE_COMPLETED:
+            info->has_status = true;
+            info->status = g_strdup("completed");
+            info->has_total_time = true;
+            info->total_time = s->total_time;
+            break;
+        }
+    }
+
+    return info;
+}
+
+static int save_snapshot_cleanup(void)
+{
+    int ret = 0;
+
+    DPRINTF("save_snapshot_cleanup\n");
+
+    snap_state.total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
+        snap_state.total_time;
+
+    if (snap_state.file) {
+        ret = qemu_fclose(snap_state.file);
+    }
+
+    if (snap_state.target) {
+        /* try to truncate, but ignore errors (will fail on block devices).
+         * note1: bdrv_read() needs whole blocks, so we need to round up
+         * note2: PVE requires 1024 (BDRV_SECTOR_SIZE*2) alignment
+         */
+        size_t size = QEMU_ALIGN_UP(snap_state.bs_pos, BDRV_SECTOR_SIZE*2);
+        blk_truncate(snap_state.target, size, false, PREALLOC_MODE_OFF, 0, NULL);
+        blk_op_unblock_all(snap_state.target, snap_state.blocker);
+        error_free(snap_state.blocker);
+        snap_state.blocker = NULL;
+        blk_unref(snap_state.target);
+        snap_state.target = NULL;
+    }
+
+    return ret;
+}
+
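To make the truncation above concrete, a worked example with an illustrative
(made-up) write position:

    /* bs_pos = 1048577 bytes written, alignment = BDRV_SECTOR_SIZE * 2 = 1024.
     * QEMU_ALIGN_UP(1048577, 1024) == ((1048577 + 1023) / 1024) * 1024
     *                              == 1025 * 1024 == 1049600
     */

The file is resized up to the next 1024-byte boundary, so the trailing
partial kilobyte of state survives while the PVE alignment requirement holds.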
+static void save_snapshot_error(const char *fmt, ...)
+{
+    va_list ap;
+    char *msg;
+
+    va_start(ap, fmt);
+    msg = g_strdup_vprintf(fmt, ap);
+    va_end(ap);
+
+    DPRINTF("save_snapshot_error: %s\n", msg);
+
+    if (!snap_state.error) {
+        error_set(&snap_state.error, ERROR_CLASS_GENERIC_ERROR, "%s", msg);
+    }
+
+    g_free(msg);
+
+    snap_state.state = SAVE_STATE_ERROR;
+}
+
+static int block_state_close(void *opaque, Error **errp)
+{
+    snap_state.file = NULL;
+    return blk_flush(snap_state.target);
+}
+
+typedef struct BlkRwCo {
+    int64_t offset;
+    QEMUIOVector *qiov;
+    ssize_t ret;
+} BlkRwCo;
+
+static void coroutine_fn block_state_write_entry(void *opaque) {
+    BlkRwCo *rwco = opaque;
+    rwco->ret = blk_co_pwritev(snap_state.target, rwco->offset, rwco->qiov->size,
+                               rwco->qiov, 0);
+    aio_wait_kick();
+}
+
+static ssize_t block_state_writev_buffer(void *opaque, struct iovec *iov,
+                                         int iovcnt, int64_t pos, Error **errp)
+{
+    QEMUIOVector qiov;
+    BlkRwCo rwco;
+
+    assert(pos == snap_state.bs_pos);
+    rwco = (BlkRwCo) {
+        .offset = pos,
+        .qiov = &qiov,
+        .ret = NOT_DONE,
+    };
+
+    qemu_iovec_init_external(&qiov, iov, iovcnt);
+
+    if (qemu_in_coroutine()) {
+        block_state_write_entry(&rwco);
+    } else {
+        Coroutine *co = qemu_coroutine_create(&block_state_write_entry, &rwco);
+        bdrv_coroutine_enter(blk_bs(snap_state.target), co);
+        BDRV_POLL_WHILE(blk_bs(snap_state.target), rwco.ret == NOT_DONE);
+    }
+    if (rwco.ret < 0) {
+        return rwco.ret;
+    }
+
+    snap_state.bs_pos += qiov.size;
+    return qiov.size;
+}
+
+static const QEMUFileOps block_file_ops = {
+    .writev_buffer = block_state_writev_buffer,
+    .close = block_state_close,
+};
+
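A sketch of how these ops get used (this mirrors what qmp_savevm_start
further down actually does; error handling elided for brevity):

    QEMUFile *f = qemu_fopen_ops(&snap_state, &block_file_ops);
    /* Every qemu_put_*() on f is eventually flushed through
     * block_state_writev_buffer(), which appends at snap_state.bs_pos and
     * advances it -- hence the assert(pos == snap_state.bs_pos) above. */

The state file is therefore written strictly sequentially, which is what
makes the final truncate-to-alignment in save_snapshot_cleanup() safe.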
+static void process_savevm_finalize(void *opaque)
+{
+    int ret;
+    AioContext *iohandler_ctx = iohandler_get_aio_context();
+    MigrationState *ms = migrate_get_current();
+
+#ifdef DEBUG_SAVEVM_STATE
+    int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+#endif
+
+    qemu_bh_delete(snap_state.finalize_bh);
+    snap_state.finalize_bh = NULL;
+    snap_state.co = NULL;
+
+    /* We need to own the target bdrv's context for the following functions,
+     * so move it back. It can stay in the main context and live out its life
+     * there, since we're done with it after this method ends anyway.
+     */
+    aio_context_acquire(iohandler_ctx);
+    blk_set_aio_context(snap_state.target, qemu_get_aio_context(), NULL);
+    aio_context_release(iohandler_ctx);
+
+    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+    if (ret < 0) {
+        save_snapshot_error("vm_stop_force_state error %d", ret);
+    }
+
+    (void)qemu_savevm_state_complete_precopy(snap_state.file, false, false);
+    ret = qemu_file_get_error(snap_state.file);
+    if (ret < 0) {
+        save_snapshot_error("qemu_savevm_state_complete_precopy error %d", ret);
+    }
+
+    DPRINTF("state saving complete\n");
+    DPRINTF("timing: process_savevm_finalize (state saving) took %ld ms\n",
+        qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
+
+    /* clear migration state */
+    migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP,
+                      ret ? MIGRATION_STATUS_FAILED : MIGRATION_STATUS_COMPLETED);
+    ms->to_dst_file = NULL;
+
+    qemu_savevm_state_cleanup();
+
+    ret = save_snapshot_cleanup();
+    if (ret < 0) {
+        save_snapshot_error("save_snapshot_cleanup error %d", ret);
+    } else if (snap_state.state == SAVE_STATE_ACTIVE) {
+        snap_state.state = SAVE_STATE_COMPLETED;
+    } else {
+        save_snapshot_error("process_savevm_finalize: invalid state: %d",
+                            snap_state.state);
+    }
+    if (snap_state.saved_vm_running) {
+        vm_start();
+        snap_state.saved_vm_running = false;
+    }
+
+    DPRINTF("timing: process_savevm_finalize (full) took %ld ms\n",
+        qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
+}
+
+static void coroutine_fn process_savevm_co(void *opaque)
+{
+    int ret;
+    int64_t maxlen;
+    BdrvNextIterator it;
+    BlockDriverState *bs = NULL;
+
+#ifdef DEBUG_SAVEVM_STATE
+    int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+#endif
+
+    ret = qemu_file_get_error(snap_state.file);
+    if (ret < 0) {
+        save_snapshot_error("qemu_savevm_state_setup failed");
+        return;
+    }
+
+    while (snap_state.state == SAVE_STATE_ACTIVE) {
+        uint64_t pending_size, pend_precopy, pend_compatible, pend_postcopy;
+
+        qemu_savevm_state_pending(snap_state.file, 0, &pend_precopy, &pend_compatible, &pend_postcopy);
+        pending_size = pend_precopy + pend_compatible + pend_postcopy;
+
+        maxlen = blk_getlength(snap_state.target) - 30*1024*1024;
+
+        if (pending_size > 400000 && snap_state.bs_pos + pending_size < maxlen) {
+            ret = qemu_savevm_state_iterate(snap_state.file, false);
+            if (ret < 0) {
+                save_snapshot_error("qemu_savevm_state_iterate error %d", ret);
+                break;
+            }
+            DPRINTF("savevm iterate pending size %lu ret %d\n", pending_size, ret);
+        } else {
+            qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
+            ret = global_state_store();
+            if (ret) {
+                save_snapshot_error("global_state_store error %d", ret);
+                break;
+            }
+
+            DPRINTF("savevm iterate complete\n");
+            break;
+        }
+    }
+
+    DPRINTF("timing: process_savevm_co took %ld ms\n",
+        qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
+
+#ifdef DEBUG_SAVEVM_STATE
+    int64_t start_time_flush = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+#endif
+    /* If a drive runs in an IOThread we can flush it async, and only
+     * need to sync-flush whatever IO happens between now and
+     * vm_stop_force_state. bdrv_next can only be called from main AioContext,
+     * so move there now and after every flush.
+     */
+    aio_co_reschedule_self(qemu_get_aio_context());
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+        /* target has BDRV_O_NO_FLUSH, no sense calling bdrv_flush on it */
+        if (bs == blk_bs(snap_state.target)) {
+            continue;
+        }
+
+        AioContext *bs_ctx = bdrv_get_aio_context(bs);
+        if (bs_ctx != qemu_get_aio_context()) {
+            DPRINTF("savevm: async flushing drive %s\n", bs->filename);
+            aio_co_reschedule_self(bs_ctx);
+            bdrv_flush(bs);
+            aio_co_reschedule_self(qemu_get_aio_context());
+        }
+    }
+
+    DPRINTF("timing: async flushing took %ld ms\n",
+        qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time_flush);
+
+    qemu_bh_schedule(snap_state.finalize_bh);
+}
+
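For a feel of the loop's thresholds, plug in illustrative (made-up) numbers:

    /* A 1 GiB state volume leaves this much room before bailing out:
     *   maxlen = 1073741824 - 30 * 1024 * 1024 = 1042284544 bytes
     * Iteration continues only while more than 400000 bytes are pending
     * and bs_pos + pending_size still fits below maxlen; otherwise the
     * coroutine stores the global state and schedules finalize_bh. */

The 30 MiB headroom plausibly leaves room for the final device state written
by process_savevm_finalize, which is not covered by this check.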
+void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp)
+{
+    Error *local_err = NULL;
+    MigrationState *ms = migrate_get_current();
+    AioContext *iohandler_ctx = iohandler_get_aio_context();
+
+    int bdrv_oflags = BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH;
+
+    if (snap_state.state != SAVE_STATE_DONE) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
+                  "VM snapshot already started");
+        return;
+    }
+
+    if (migration_is_running(ms->state)) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR, QERR_MIGRATION_ACTIVE);
+        return;
+    }
+
+    if (migrate_use_block()) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
+                  "Block migration and snapshots are incompatible");
+        return;
+    }
+
+    /* initialize snapshot info */
+    snap_state.saved_vm_running = runstate_is_running();
+    snap_state.bs_pos = 0;
+    snap_state.total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    snap_state.blocker = NULL;
+
+    if (snap_state.error) {
+        error_free(snap_state.error);
+        snap_state.error = NULL;
+    }
+
+    if (!has_statefile) {
+        vm_stop(RUN_STATE_SAVE_VM);
+        snap_state.state = SAVE_STATE_COMPLETED;
+        return;
+    }
+
+    if (qemu_savevm_state_blocked(errp)) {
+        return;
+    }
+
+    /* Open the image */
+    QDict *options = NULL;
+    options = qdict_new();
+    qdict_put_str(options, "driver", "raw");
+    snap_state.target = blk_new_open(statefile, NULL, options, bdrv_oflags, &local_err);
+    if (!snap_state.target) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "failed to open '%s'", statefile);
+        goto restart;
+    }
+
+    snap_state.file = qemu_fopen_ops(&snap_state, &block_file_ops);
+
+    if (!snap_state.file) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "failed to open '%s'", statefile);
+        goto restart;
+    }
+
+    /*
+     * qemu_savevm_* paths use migration code and expect a migration state.
+     * State is cleared in process_savevm_co, but has to be initialized
+     * here (blocking main thread, from QMP) to avoid race conditions.
+     */
+    migrate_init(ms);
+    memset(&ram_counters, 0, sizeof(ram_counters));
+    ms->to_dst_file = snap_state.file;
+
+    error_setg(&snap_state.blocker, "block device is in use by savevm");
+    blk_op_block_all(snap_state.target, snap_state.blocker);
+
+    snap_state.state = SAVE_STATE_ACTIVE;
+    snap_state.finalize_bh = qemu_bh_new(process_savevm_finalize, &snap_state);
+    snap_state.co = qemu_coroutine_create(&process_savevm_co, NULL);
+    qemu_mutex_unlock_iothread();
+    qemu_savevm_state_header(snap_state.file);
+    qemu_savevm_state_setup(snap_state.file);
+    qemu_mutex_lock_iothread();
+
+    /* Async processing from here on out happens in iohandler context, so let
+     * the target bdrv have its home there.
+     */
+    blk_set_aio_context(snap_state.target, iohandler_ctx, &local_err);
+
+    aio_co_schedule(iohandler_ctx, snap_state.co);
+
+    return;
+
+restart:
+
+    save_snapshot_error("setup failed");
+
+    if (snap_state.saved_vm_running) {
+        vm_start();
+    }
+}
+
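End to end, a snapshot cycle through the new HMP commands could look like
this (device name, path and the numbers shown are hypothetical; the output
lines follow the monitor_printf formats in hmp_info_savevm above):

    (qemu) savevm-start /mnt/pve/state/vm-100-state.raw
    (qemu) info savevm
    savevm status: active
    total time: 1521 milliseconds
    Bytes saved: 268435456
    (qemu) snapshot-drive drive-virtio0 snap1
    (qemu) savevm-end

savevm-start returns immediately and the state is written while the guest
keeps running; the guest is only paused for the final sync in
process_savevm_finalize. savevm-end resets the state machine to
SAVE_STATE_DONE and resumes the guest if it is still paused.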
+void qmp_savevm_end(Error **errp)
+{
+    if (snap_state.state == SAVE_STATE_DONE) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
+                  "VM snapshot not started");
+        return;
+    }
+
+    if (snap_state.state == SAVE_STATE_ACTIVE) {
+        snap_state.state = SAVE_STATE_CANCELLED;
+        return;
+    }
+
+    if (snap_state.saved_vm_running) {
+        vm_start();
+    }
+
+    snap_state.state = SAVE_STATE_DONE;
+}
+
+// FIXME: Deprecated
+void qmp_snapshot_drive(const char *device, const char *name, Error **errp)
+{
+    // Compatibility to older qemu-server.
+    qmp_blockdev_snapshot_internal_sync(device, name, errp);
+}
+
+// FIXME: Deprecated
+void qmp_delete_drive_snapshot(const char *device, const char *name,
+                               Error **errp)
+{
+    // Compatibility to older qemu-server.
+    (void)qmp_blockdev_snapshot_delete_internal_sync(device, false, NULL,
+                                                     true, name, errp);
+}
+
+static ssize_t loadstate_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
+                                    size_t size, Error **errp)
+{
+    BlockBackend *be = opaque;
+    int64_t maxlen = blk_getlength(be);
+    if (pos > maxlen) {
+        return -EIO;
+    }
+    if ((pos + size) > maxlen) {
+        size = maxlen - pos - 1;
+    }
+    if (size == 0) {
+        return 0;
+    }
+    return blk_pread(be, pos, buf, size);
+}
+
+static const QEMUFileOps loadstate_file_ops = {
+    .get_buffer = loadstate_get_buffer,
+};
+
+int load_snapshot_from_blockdev(const char *filename, Error **errp)
+{
+    BlockBackend *be;
+    Error *local_err = NULL;
+    Error *blocker = NULL;
+
+    QEMUFile *f;
+    int ret = -EINVAL;
+
+    be = blk_new_open(filename, NULL, NULL, 0, &local_err);
+
+    if (!be) {
+        error_setg(errp, "Could not open VM state file");
+        goto the_end;
+    }
+
+    error_setg(&blocker, "block device is in use by load state");
+    blk_op_block_all(be, blocker);
+
+    /* restore the VM state */
+    f = qemu_fopen_ops(be, &loadstate_file_ops);
+    if (!f) {
+        error_setg(errp, "Could not open VM state file");
+        goto the_end;
+    }
+
+    qemu_system_reset(SHUTDOWN_CAUSE_NONE);
+    ret = qemu_loadvm_state(f);
+
+    qemu_fclose(f);
+    migration_incoming_state_destroy();
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Error while loading VM state");
+        goto the_end;
+    }
+
+    ret = 0;
+
+ the_end:
+    if (be) {
+        blk_op_unblock_all(be, blocker);
+        error_free(blocker);
+        blk_unref(be);
+    }
+    return ret;
+}
diff --git a/softmmu/vl.c b/softmmu/vl.c
index 4eb9d1f7fd..670b7e427c 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -2844,6 +2844,7 @@ void qemu_init(int argc, char **argv, char **envp)
     int optind;
     const char *optarg;
     const char *loadvm = NULL;
+    const char *loadstate = NULL;
     MachineClass *machine_class;
     const char *cpu_option;
     const char *vga_model = NULL;
@@ -3408,6 +3409,9 @@ void qemu_init(int argc, char **argv, char **envp)
         case QEMU_OPTION_loadvm:
             loadvm = optarg;
             break;
+        case QEMU_OPTION_loadstate:
+            loadstate = optarg;
+            break;
         case QEMU_OPTION_full_screen:
             dpy.has_full_screen = true;
             dpy.full_screen = true;
@@ -4464,6 +4468,12 @@ void qemu_init(int argc, char **argv, char **envp)
             autostart = 0;
             exit(1);
         }
+    } else if (loadstate) {
+        Error *local_err = NULL;
+        if (load_snapshot_from_blockdev(loadstate, &local_err) < 0) {
+            error_report_err(local_err);
+            autostart = 0;
+        }
     }
     if (replay_mode != REPLAY_MODE_NONE) {
         replay_vmstate_init();
diff --git a/util/async.c b/util/async.c
index 1319eee3bc..b68e73f488 100644
--- 
a/util/async.c +++ b/util/async.c @@ -559,6 +559,36 @@ void aio_co_schedule(AioContext *ctx, Coroutine *co) aio_context_unref(ctx); } +typedef struct AioCoRescheduleSelf { + Coroutine *co; + AioContext *new_ctx; +} AioCoRescheduleSelf; + +static void aio_co_reschedule_self_bh(void *opaque) +{ + AioCoRescheduleSelf *data = opaque; + aio_co_schedule(data->new_ctx, data->co); +} + +void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx) +{ + AioContext *old_ctx = qemu_get_current_aio_context(); + + if (old_ctx != new_ctx) { + AioCoRescheduleSelf data = { + .co = qemu_coroutine_self(), + .new_ctx = new_ctx, + }; + /* + * We can't directly schedule the coroutine in the target context + * because this would be racy: The other thread could try to enter the + * coroutine before it has yielded in this one. + */ + aio_bh_schedule_oneshot(old_ctx, aio_co_reschedule_self_bh, &data); + qemu_coroutine_yield(); + } +} + void aio_co_wake(struct Coroutine *co) { AioContext *ctx;