From 95319fc569cf1ab322926f037b92dd4fd15b5630 Mon Sep 17 00:00:00 2001 From: Tom Caputi Date: Tue, 27 Aug 2019 12:55:51 -0400 Subject: [PATCH] Fix deadlock in 'zfs rollback' Currently, the 'zfs rollback' code can end up deadlocked due to the way the kernel handles unreferenced inodes on a suspended fs. Essentially, the zfs_resume_fs() code path may cause zfs to spawn new threads as it reinstantiates the suspended fs's zil. When a new thread is spawned, the kernel may attempt to free memory for that thread by freeing some unreferenced inodes. If it happens to select inodes that are a a part of the suspended fs a deadlock will occur because freeing inodes requires holding the fs's z_teardown_inactive_lock which is still held from the suspend. This patch corrects this issue by adding an additional reference to all inodes that are still present when a suspend is initiated. This prevents them from being freed by the kernel for any reason. Reviewed-by: Alek Pinchuk Reviewed-by: Brian Behlendorf Signed-off-by: Tom Caputi Closes #9203 --- include/sys/zfs_znode.h | 1 + module/zfs/zfs_vfsops.c | 16 +++++++++++++++- module/zfs/zfs_znode.c | 1 + 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index add45a7f4..01b358cc4 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -196,6 +196,7 @@ typedef struct znode { uint8_t z_atime_dirty; /* atime needs to be synced */ uint8_t z_zn_prefetch; /* Prefetch znodes? */ uint8_t z_moved; /* Has this znode been moved? */ + boolean_t z_suspended; /* extra ref from a suspend? */ uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ uint64_t z_mapcnt; /* number of pages mapped to file */ diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index 371c412f6..489f12b7f 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -1736,7 +1736,12 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) * will fail with EIO since we have z_teardown_lock for writer (only * relevant for forced unmount). * - * Release all holds on dbufs. + * Release all holds on dbufs. We also grab an extra reference to all + * the remaining inodes so that the kernel does not attempt to free + * any inodes of a suspended fs. This can cause deadlocks since the + * zfs_resume_fs() process may involve starting threads, which might + * attempt to free unreferenced inodes to free up memory for the new + * thread. */ if (!unmounting) { mutex_enter(&zfsvfs->z_znodes_lock); @@ -1744,6 +1749,9 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) zp = list_next(&zfsvfs->z_all_znodes, zp)) { if (zp->z_sa_hdl) zfs_znode_dmu_fini(zp); + if (igrab(ZTOI(zp)) != NULL) + zp->z_suspended = B_TRUE; + } mutex_exit(&zfsvfs->z_znodes_lock); } @@ -2192,6 +2200,12 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) remove_inode_hash(ZTOI(zp)); zp->z_is_stale = B_TRUE; } + + /* see comment in zfs_suspend_fs() */ + if (zp->z_suspended) { + zfs_iput_async(ZTOI(zp)); + zp->z_suspended = B_FALSE; + } } mutex_exit(&zfsvfs->z_znodes_lock); diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 3dd299942..91162e857 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -540,6 +540,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_moved = 0; + zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; zp->z_unlinked = 0; zp->z_atime_dirty = 0;