From 460d887c439079422b642da87a77dbb896f5e64a Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 24 Mar 2023 13:27:07 -0400 Subject: [PATCH] panic loop when removing slog device There is a window in the slog removal code where a panic loop could ensue if the system crashes during that operation. The original design of slog removal did not persist any state because the removal happened synchronously. This was changed by a later commit which persisted the vdev_removing flag and exposed this bug. If a slog removal is in progress and happens to crash after persisting the vdev_removing flag to the label but before the vdev is removed from the spa config, then the pool will continue to panic on import. Here's a sample of the panic: [ 134.387411] VERIFY0(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)) failed (0 == 22) [ 134.393865] PANIC at dmu.c:1135:dmu_write() [ 134.396035] Kernel panic - not syncing: VERIFY0(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)) failed (0 == 22) [ 134.397857] CPU: 2 PID: 5914 Comm: txg_sync Kdump: loaded Tainted: P OE 5.4.0-1100-dx2023020205-b3751f8c2-azure #106 [ 134.407938] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090008 12/07/2018 [ 134.407938] Call Trace: [ 134.407938] dump_stack+0x57/0x6d [ 134.407938] panic+0xfb/0x2d7 [ 134.407938] spl_panic+0xcf/0x102 [spl] [ 134.407938] ? traverse_impl+0x1ca/0x420 [zfs] [ 134.407938] ? dmu_object_alloc_impl+0x3b4/0x3c0 [zfs] [ 134.407938] ? dnode_hold+0x1b/0x20 [zfs] [ 134.407938] dmu_write+0xc3/0xd0 [zfs] [ 134.407938] ? space_map_alloc+0x55/0x80 [zfs] [ 134.407938] metaslab_sync+0x61a/0x830 [zfs] [ 134.407938] ? queued_spin_unlock+0x9/0x10 [zfs] [ 134.407938] vdev_sync+0x72/0x190 [zfs] [ 134.407938] spa_sync_iterate_to_convergence+0x160/0x250 [zfs] [ 134.407938] spa_sync+0x2f7/0x670 [zfs] [ 134.407938] txg_sync_thread+0x22d/0x2d0 [zfs] [ 134.407938] ? 
txg_dispatch_callbacks+0xf0/0xf0 [zfs] [ 134.407938] thread_generic_wrapper+0x83/0xa0 [spl] [ 134.407938] kthread+0x104/0x140 [ 134.407938] ? kasan_check_write.constprop.0+0x10/0x10 [spl] [ 134.407938] ? kthread_park+0x90/0x90 [ 134.457802] ret_from_fork+0x1f/0x40 This change no longer persists the vdev_removing flag when removing slog devices and also cleans up some code that was added which is not used. Reviewed-by: Brian Behlendorf Reviewed-by: Matthew Ahrens Reviewed-by: Mark Maybee Signed-off-by: George Wilson Closes #14652 --- include/sys/vdev.h | 5 ++--- module/zfs/vdev_label.c | 32 ++++++++++++-------------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 7a7c70dc1..d529bbcdd 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -186,9 +186,8 @@ extern boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx); typedef enum vdev_config_flag { VDEV_CONFIG_SPARE = 1 << 0, VDEV_CONFIG_L2CACHE = 1 << 1, - VDEV_CONFIG_REMOVING = 1 << 2, - VDEV_CONFIG_MOS = 1 << 3, - VDEV_CONFIG_MISSING = 1 << 4 + VDEV_CONFIG_MOS = 1 << 2, + VDEV_CONFIG_MISSING = 1 << 3 } vdev_config_flag_t; extern void vdev_post_kobj_evt(vdev_t *vd); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index d81bc29f2..f61be65a2 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -500,7 +500,12 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, vd->vdev_noalloc); } - if (vd->vdev_removing) { + + /* + * Slog devices are removed synchronously so don't + * persist the vdev_removing flag to the label. 
+ */ + if (vd->vdev_removing && !vd->vdev_islog) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); } @@ -644,35 +649,22 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (!vd->vdev_ops->vdev_op_leaf) { nvlist_t **child; - int c, idx; + uint64_t c; ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); - for (c = 0, idx = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - /* - * If we're generating an nvlist of removing - * vdevs then skip over any device which is - * not being removed. - */ - if ((flags & VDEV_CONFIG_REMOVING) && - !cvd->vdev_removing) - continue; - - child[idx++] = vdev_config_generate(spa, cvd, + for (c = 0; c < vd->vdev_children; c++) { + child[c] = vdev_config_generate(spa, vd->vdev_child[c], getstats, flags); } - if (idx) { - fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - (const nvlist_t * const *)child, idx); - } + fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + (const nvlist_t * const *)child, vd->vdev_children); - for (c = 0; c < idx; c++) + for (c = 0; c < vd->vdev_children; c++) nvlist_free(child[c]); kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));