bf251437e9
Many changes were necessary this time around: * QAPI was changed to avoid redundant has_* variables, see commit 44ea9d9be3 ("qapi: Start to elide redundant has_FOO in generated C") for details. This affected many QMP commands added by Proxmox too. * Pending querying for migration got split into two functions, one to estimate, one for exact value, see commit c8df4a7aef ("migration: Split save_live_pending() into state_pending_*") for details. Relevant for savevm-async and PBS dirty bitmap. * Some block (driver) functions got converted to coroutines, so the Proxmox block drivers needed to be adapted. * Alloc track auto-detaching during PBS live restore got broken by AioContext-related changes resulting in a deadlock. The current, hacky method was replaced by a simpler one. Stefan apparently ran into a problem with that when he wrote the driver, but there were improvements in the stream job code since then and I didn't manage to reproduce the issue. It's a separate patch "alloc-track: fix deadlock during drop" for now, you can find the details there. * Async snapshot-related changes: - The pending querying got adapted to the above-mentioned split and a patch is added to optimize it/make it more similar to what upstream code does. - Added initialization of the compression counters (for future-proofing). - It's necessary to hold the BQL (big QEMU lock = iothread mutex) during the setup phase, because block layer functions are used there and not doing so leads to racy, hard-to-debug crashes or hangs. It's necessary to change some upstream code too for this, a version of the patch "migration: for snapshots, hold the BQL during setup callbacks" is intended to be upstreamed. - Need to take the bdrv graph read lock before flushing. * hmp_info_balloon was moved to a different file. * Needed to include new headers from time to time to still get the correct functions. Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
119 lines
5.0 KiB
Diff
119 lines
5.0 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: Alexander Bulekov <alxndr@bu.edu>
|
|
Date: Sat, 4 Feb 2023 23:07:34 -0500
|
|
Subject: [PATCH] memory: prevent dma-reentrancy issues
|
|
|
|
Add a flag to the DeviceState that is set while a device is engaged in PIO/MMIO/DMA.
|
|
This flag is set/checked prior to calling a device's MemoryRegion
|
|
handlers, and set when device code initiates DMA. The purpose of this
|
|
flag is to prevent two types of DMA-based reentrancy issues:
|
|
|
|
1.) mmio -> dma -> mmio case
|
|
2.) bh -> dma write -> mmio case
|
|
|
|
These issues have led to problems such as stack-exhaustion and
|
|
use-after-frees.
|
|
|
|
Summary of the problem from Peter Maydell:
|
|
https://lore.kernel.org/qemu-devel/CAFEAcA_23vc7hE3iaM-JVA6W38LK4hJoWae5KcknhPRD5fPBZA@mail.gmail.com
|
|
|
|
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/62
|
|
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/540
|
|
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/541
|
|
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/556
|
|
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/557
|
|
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/827
|
|
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1282
|
|
|
|
Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
|
|
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
|
Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
|
|
Acked-by: Peter Xu <peterx@redhat.com>
|
|
(picked-up from https://lists.nongnu.org/archive/html/qemu-devel/2023-02/msg01142.html)
|
|
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
|
|
---
|
|
include/hw/qdev-core.h | 7 +++++++
|
|
softmmu/memory.c | 17 +++++++++++++++++
|
|
softmmu/trace-events | 1 +
|
|
3 files changed, 25 insertions(+)
|
|
|
|
diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
|
|
index bd50ad5ee1..7623703943 100644
|
|
--- a/include/hw/qdev-core.h
|
|
+++ b/include/hw/qdev-core.h
|
|
@@ -162,6 +162,10 @@ struct NamedClockList {
|
|
QLIST_ENTRY(NamedClockList) node;
|
|
};
|
|
|
|
+typedef struct {
|
|
+ bool engaged_in_io;
|
|
+} MemReentrancyGuard;
|
|
+
|
|
/**
|
|
* DeviceState:
|
|
* @realized: Indicates whether the device has been fully constructed.
|
|
@@ -194,6 +198,9 @@ struct DeviceState {
|
|
int alias_required_for_version;
|
|
ResettableState reset;
|
|
GSList *unplug_blockers;
|
|
+
|
|
+ /* Is the device currently in mmio/pio/dma? Used to prevent re-entrancy */
|
|
+ MemReentrancyGuard mem_reentrancy_guard;
|
|
};
|
|
|
|
struct DeviceListener {
|
|
diff --git a/softmmu/memory.c b/softmmu/memory.c
|
|
index b1a6cae6f5..e4d2268d32 100644
|
|
--- a/softmmu/memory.c
|
|
+++ b/softmmu/memory.c
|
|
@@ -533,6 +533,7 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
|
|
uint64_t access_mask;
|
|
unsigned access_size;
|
|
unsigned i;
|
|
+ DeviceState *dev = NULL;
|
|
MemTxResult r = MEMTX_OK;
|
|
|
|
if (!access_size_min) {
|
|
@@ -542,6 +543,19 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
|
|
access_size_max = 4;
|
|
}
|
|
|
|
+ /* Do not allow more than one simultaneous access to a device's IO Regions */
|
|
+ if (mr->owner &&
|
|
+ !mr->ram_device && !mr->ram && !mr->rom_device && !mr->readonly) {
|
|
+ dev = (DeviceState *) object_dynamic_cast(mr->owner, TYPE_DEVICE);
|
|
+ if (dev) {
|
|
+ if (dev->mem_reentrancy_guard.engaged_in_io) {
|
|
+ trace_memory_region_reentrant_io(get_cpu_index(), mr, addr, size);
|
|
+ return MEMTX_ERROR;
|
|
+ }
|
|
+ dev->mem_reentrancy_guard.engaged_in_io = true;
|
|
+ }
|
|
+ }
|
|
+
|
|
/* FIXME: support unaligned access? */
|
|
access_size = MAX(MIN(size, access_size_max), access_size_min);
|
|
access_mask = MAKE_64BIT_MASK(0, access_size * 8);
|
|
@@ -556,6 +570,9 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
|
|
access_mask, attrs);
|
|
}
|
|
}
|
|
+ if (dev) {
|
|
+ dev->mem_reentrancy_guard.engaged_in_io = false;
|
|
+ }
|
|
return r;
|
|
}
|
|
|
|
diff --git a/softmmu/trace-events b/softmmu/trace-events
|
|
index 22606dc27b..62d04ea9a7 100644
|
|
--- a/softmmu/trace-events
|
|
+++ b/softmmu/trace-events
|
|
@@ -13,6 +13,7 @@ memory_region_ops_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, u
|
|
memory_region_ops_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size, const char *name) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u name '%s'"
|
|
memory_region_subpage_read(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u"
|
|
memory_region_subpage_write(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u"
|
|
+memory_region_reentrant_io(int cpu_index, void *mr, uint64_t offset, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" size %u"
|
|
memory_region_ram_device_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
|
|
memory_region_ram_device_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
|
|
memory_region_sync_dirty(const char *mr, const char *listener, int global) "mr '%s' listener '%s' synced (global=%d)"
|