Illumos 5056 - ZFS deadlock on db_mtx and dn_holds

5056 ZFS deadlock on db_mtx and dn_holds
Author: Justin Gibbs <justing@spectralogic.com>
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/5056
  https://github.com/illumos/illumos-gate/commit/bc9014e

Porting Notes:

sa_handle_get_from_db():
  - the original patch includes an otherwise unmentioned fix for a
    possible usage of an uninitialised variable

dmu_objset_open_impl():
  - Under Illumos list_link_init() is the same as filling a list_node_t
    with NULLs, so they don't notice if they miss doing list_link_init()
    on a zero'd containing structure (e.g. allocated with kmem_zalloc as
    here). Under Linux, not so much: an uninitialised list_node_t goes
    "Boom!" some time later when it's used or destroyed.

dmu_objset_evict_dbufs():
  - reduce stack usage using kmem_alloc()

Ported-by: Chris Dunlop <chris@onthe.net.au>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Justin T. Gibbs
2015-04-02 14:44:32 +11:00
committed by Brian Behlendorf
parent d683ddbb72
commit 0c66c32d1d
35 changed files with 645 additions and 316 deletions
+28 -40
View File
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -402,53 +403,41 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
void
dnode_evict_dbufs(dnode_t *dn)
{
int progress;
int pass = 0;
dmu_buf_impl_t *db_marker;
dmu_buf_impl_t *db, *db_next;
do {
dmu_buf_impl_t *db, *db_next;
int evicting = FALSE;
db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
mutex_enter(&dn->dn_dbufs_mtx);
for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
progress = FALSE;
mutex_enter(&dn->dn_dbufs_mtx);
for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
db_next = AVL_NEXT(&dn->dn_dbufs, db);
#ifdef DEBUG
DB_DNODE_ENTER(db);
ASSERT3P(DB_DNODE(db), ==, dn);
DB_DNODE_EXIT(db);
DB_DNODE_ENTER(db);
ASSERT3P(DB_DNODE(db), ==, dn);
DB_DNODE_EXIT(db);
#endif /* DEBUG */
mutex_enter(&db->db_mtx);
if (db->db_state == DB_EVICTING) {
progress = TRUE;
evicting = TRUE;
mutex_exit(&db->db_mtx);
} else if (refcount_is_zero(&db->db_holds)) {
progress = TRUE;
dbuf_clear(db); /* exits db_mtx for us */
} else {
mutex_exit(&db->db_mtx);
}
mutex_enter(&db->db_mtx);
if (db->db_state != DB_EVICTING &&
refcount_is_zero(&db->db_holds)) {
db_marker->db_level = db->db_level;
db_marker->db_blkid = db->db_blkid;
db_marker->db_state = DB_SEARCH;
avl_insert_here(&dn->dn_dbufs, db_marker, db,
AVL_BEFORE);
dbuf_clear(db);
db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
avl_remove(&dn->dn_dbufs, db_marker);
} else {
mutex_exit(&db->db_mtx);
db_next = AVL_NEXT(&dn->dn_dbufs, db);
}
/*
* NB: we need to drop dn_dbufs_mtx between passes so
* that any DB_EVICTING dbufs can make progress.
* Ideally, we would have some cv we could wait on, but
* since we don't, just wait a bit to give the other
* thread a chance to run.
*/
mutex_exit(&dn->dn_dbufs_mtx);
if (evicting)
delay(1);
pass++;
if ((pass % 100) == 0)
dprintf("Exceeded %d passes evicting dbufs\n", pass);
} while (progress);
}
mutex_exit(&dn->dn_dbufs_mtx);
if (pass >= 100)
dprintf("Required %d passes to evict dbufs\n", pass);
kmem_free(db_marker, sizeof (dmu_buf_impl_t));
dnode_evict_bonus(dn);
}
@@ -513,7 +502,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
dnode_evict_dbufs(dn);
ASSERT(avl_is_empty(&dn->dn_dbufs));
ASSERT3P(dn->dn_bonus, ==, NULL);
/*
* XXX - It would be nice to assert this, but we may still