Fix zsb->z_hold_mtx deadlock

The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
serialize access to a znode and its SA buffer while the object is being
created or destroyed.  This kind of locking would normally reside in the
znode itself but in this case that's impossible because the znode and SA
buffer may not yet exist.  Therefore the locking is handled externally
with an array of mutexes and AVL trees which contain per-object locks.

In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
into the correct AVL tree and finally the per-object lock is held.  In
zfs_znode_hold_exit() the process is reversed.  The per-object lock is
released, removed from the AVL tree and destroyed if there are no waiters.

This scheme has two important properties:

1) No memory allocations are performed while holding one of the z_hold_locks.
   This ensures evict(), which can be called from direct memory reclaim, will
   never block waiting on a z_hold_locks mutex which just happens to have
   hashed to the same index.

2) All locks used to serialize access to an object are per-object and never
   shared.  This minimizes lock contention without creating a large number
   of dedicated locks.

On the downside it does require znode_hold_t structures to be frequently
allocated and freed.  However, because these are backed by a kmem cache
and are very short-lived this cost is minimal.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4106
This commit is contained in:
Brian Behlendorf
2015-12-22 13:47:38 -08:00
parent 0720116d4d
commit c96c36fa22
4 changed files with 218 additions and 64 deletions
+18 -14
View File
@@ -663,7 +663,7 @@ zfs_sb_create(const char *osname, zfs_mntopts_t *zmo, zfs_sb_t **zsbp)
objset_t *os;
zfs_sb_t *zsb;
uint64_t zval;
int i, error;
int i, size, error;
uint64_t sa_obj;
zsb = kmem_zalloc(sizeof (zfs_sb_t), KM_SLEEP);
@@ -685,8 +685,7 @@ zfs_sb_create(const char *osname, zfs_mntopts_t *zmo, zfs_sb_t **zsbp)
/*
* Initialize the zfs-specific filesystem structure.
* Should probably make this a kmem cache, shuffle fields,
* and just bzero up to z_hold_mtx[].
* Should probably make this a kmem cache, shuffle fields.
*/
zsb->z_sb = NULL;
zsb->z_parent = zsb;
@@ -795,12 +794,15 @@ zfs_sb_create(const char *osname, zfs_mntopts_t *zmo, zfs_sb_t **zsbp)
rw_init(&zsb->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zsb->z_fuid_lock, NULL, RW_DEFAULT, NULL);
zsb->z_hold_mtx_size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1),
ZFS_OBJ_MTX_MAX);
zsb->z_hold_mtx = vmem_zalloc(sizeof (kmutex_t) * zsb->z_hold_mtx_size,
KM_SLEEP);
for (i = 0; i != zsb->z_hold_mtx_size; i++)
mutex_init(&zsb->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
zsb->z_hold_size = size;
zsb->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, KM_SLEEP);
zsb->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
for (i = 0; i != size; i++) {
avl_create(&zsb->z_hold_trees[i], zfs_znode_hold_compare,
sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
mutex_init(&zsb->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
}
*zsbp = zsb;
return (0);
@@ -809,7 +811,6 @@ out:
dmu_objset_disown(os, zsb);
*zsbp = NULL;
vmem_free(zsb->z_hold_mtx, sizeof (kmutex_t) * zsb->z_hold_mtx_size);
kmem_free(zsb, sizeof (zfs_sb_t));
return (error);
}
@@ -901,7 +902,7 @@ EXPORT_SYMBOL(zfs_sb_setup);
void
zfs_sb_free(zfs_sb_t *zsb)
{
int i;
int i, size = zsb->z_hold_size;
zfs_fuid_destroy(zsb);
@@ -911,9 +912,12 @@ zfs_sb_free(zfs_sb_t *zsb)
rrm_destroy(&zsb->z_teardown_lock);
rw_destroy(&zsb->z_teardown_inactive_lock);
rw_destroy(&zsb->z_fuid_lock);
for (i = 0; i != zsb->z_hold_mtx_size; i++)
mutex_destroy(&zsb->z_hold_mtx[i]);
vmem_free(zsb->z_hold_mtx, sizeof (kmutex_t) * zsb->z_hold_mtx_size);
for (i = 0; i != size; i++) {
avl_destroy(&zsb->z_hold_trees[i]);
mutex_destroy(&zsb->z_hold_locks[i]);
}
vmem_free(zsb->z_hold_trees, sizeof (avl_tree_t) * size);
vmem_free(zsb->z_hold_locks, sizeof (kmutex_t) * size);
zfs_mntopts_free(zsb->z_mntopts);
kmem_free(zsb, sizeof (zfs_sb_t));
}