From 8951cb8dfb8dcf410a237656c1f9c9767e4a9e6c Mon Sep 17 00:00:00 2001 From: Alex Reece Date: Fri, 3 Apr 2015 14:14:28 +1100 Subject: [PATCH] Illumos 4873 - zvol unmap calls can take a very long time for larger datasets 4873 zvol unmap calls can take a very long time for larger datasets Author: Alex Reece Reviewed by: George Wilson Reviewed by: Matthew Ahrens Reviewed by: Paul Dagnelie Reviewed by: Basil Crow Reviewed by: Dan McDonald Approved by: Robert Mustacchi References: https://www.illumos.org/issues/4873 https://github.com/illumos/illumos-gate/commit/0f6d88a Porting Notes: dbuf_free_range(): - reduce stack usage using kmem_alloc() - the sorted AVL tree will handle the spill block case correctly without all the special handling in the for() loop Ported-by: Chris Dunlop Signed-off-by: Brian Behlendorf --- include/sys/avl.h | 9 ++++++ include/sys/dbuf.h | 7 +++-- include/sys/dnode.h | 2 +- module/avl/avl.c | 30 ++++++++++++++++++ module/zfs/dbuf.c | 68 +++++++++++++++++++++++++---------------- module/zfs/dnode.c | 59 ++++++++++++++++++++++++++++------- module/zfs/dnode_sync.c | 12 +++----- 7 files changed, 138 insertions(+), 49 deletions(-) diff --git a/include/sys/avl.h b/include/sys/avl.h index ba305c908..9bc2b4a32 100644 --- a/include/sys/avl.h +++ b/include/sys/avl.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + #ifndef _AVL_H #define _AVL_H @@ -259,6 +263,11 @@ extern boolean_t avl_update(avl_tree_t *, void *); extern boolean_t avl_update_lt(avl_tree_t *, void *); extern boolean_t avl_update_gt(avl_tree_t *, void *); +/* + * Swaps the contents of the two trees. + */ +extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2); + /* * Return the number of nodes in the tree */ diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 1eabfd7da..2f593bb4d 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ @@ -213,11 +213,14 @@ typedef struct dmu_buf_impl { /* pointer to most recent dirty record for this buffer */ dbuf_dirty_record_t *db_last_dirty; + /* Creation time of dbuf (see comment in dbuf_compare). */ + hrtime_t db_creation; + /* * Our link on the owner dnodes's dn_dbufs list. * Protected by its dn_dbufs_mtx. */ - list_node_t db_link; + avl_node_t db_link; /* Data which is unique to data (leaf) blocks: */ diff --git a/include/sys/dnode.h b/include/sys/dnode.h index b63549b46..8a72722eb 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -233,7 +233,7 @@ typedef struct dnode { refcount_t dn_holds; kmutex_t dn_dbufs_mtx; - list_t dn_dbufs; /* descendent dbufs */ + avl_tree_t dn_dbufs; /* descendent dbufs */ /* protected by dn_struct_rwlock */ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ diff --git a/module/avl/avl.c b/module/avl/avl.c index f9971da20..46bf49d8c 100644 --- a/module/avl/avl.c +++ b/module/avl/avl.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + /* * AVL - generic AVL tree implementation for kernel use * @@ -85,6 +89,12 @@ * is a modified "avl_node_t *". The bottom bit (normally 0 for a * pointer) is set to indicate if that the new node has a value greater * than the value of the indicated "avl_node_t *". + * + * Note - in addition to userland (e.g. libavl and libutil) and the kernel + * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module, + * which each have their own compilation environments and subsequent + * requirements. Each of these environments must be considered when adding + * dependencies from avl.c. */ #include @@ -863,6 +873,24 @@ avl_update(avl_tree_t *t, void *obj) return (B_FALSE); } +void +avl_swap(avl_tree_t *tree1, avl_tree_t *tree2) +{ + avl_node_t *temp_node; + ulong_t temp_numnodes; + + ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar); + ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset); + ASSERT3U(tree1->avl_size, ==, tree2->avl_size); + + temp_node = tree1->avl_root; + temp_numnodes = tree1->avl_numnodes; + tree1->avl_root = tree2->avl_root; + tree1->avl_numnodes = tree2->avl_numnodes; + tree2->avl_root = temp_node; + tree2->avl_numnodes = temp_numnodes; +} + /* * initialize a new AVL tree */ @@ -1058,6 +1086,8 @@ EXPORT_SYMBOL(avl_first); EXPORT_SYMBOL(avl_last); EXPORT_SYMBOL(avl_nearest); EXPORT_SYMBOL(avl_add); +EXPORT_SYMBOL(avl_swap); +EXPORT_SYMBOL(avl_is_empty); EXPORT_SYMBOL(avl_remove); EXPORT_SYMBOL(avl_numnodes); EXPORT_SYMBOL(avl_destroy_nodes); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index f5327a34a..e9c8580fc 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -93,7 +93,9 @@ dbuf_cons(void *vdb, void *unused, int kmflag) mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); refcount_create(&db->db_holds); - list_link_init(&db->db_link); + + db->db_creation = gethrtime(); + return (0); } @@ -386,7 +388,7 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || - !list_is_empty(&dn->dn_dbufs)); + !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); @@ -866,23 +868,34 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * receive; see comment below for details. */ void -dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) +dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, + dmu_tx_t *tx) { - dmu_buf_impl_t *db, *db_next; + dmu_buf_impl_t *db, *db_next, *db_search; uint64_t txg = tx->tx_txg; + avl_index_t where; boolean_t freespill = - (start == DMU_SPILL_BLKID || end == DMU_SPILL_BLKID); + (start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID); - if (end > dn->dn_maxblkid && !freespill) - end = dn->dn_maxblkid; - dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); + if (end_blkid > dn->dn_maxblkid && !freespill) + end_blkid = dn->dn_maxblkid; + dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); + + db_seach = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); + db_search->db_level = 0; + db_search->db_blkid = start_blkid; + db_search->db_creation = 0; mutex_enter(&dn->dn_dbufs_mtx); - if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz && - !freespill) { + if (start_blkid >= dn->dn_unlisted_l0_blkid && !freespill) { /* There can't be any dbufs in this range; no need to search. */ - mutex_exit(&dn->dn_dbufs_mtx); - return; +#ifdef DEBUG + db = avl_find(&dn->dn_dbufs, db_search, &where); + ASSERT3P(db, ==, NULL); + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + ASSERT(db == NULL || db->db_level > 0); +#endif + goto out; } else if (dmu_objset_is_receiving(dn->dn_objset)) { /* * If we are receiving, we expect there to be no dbufs in @@ -894,19 +907,18 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) atomic_inc_64(&zfs_free_range_recv_miss); } - for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); + db = avl_find(&dn->dn_dbufs, db_search, &where); + ASSERT3P(db, ==, NULL); + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + + for (; db != NULL; db = db_next) { + db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); - /* Skip indirect blocks. */ - if (db->db_level != 0) - continue; - /* Skip direct blocks outside the range. */ - if (!freespill && (db->db_blkid < start || db->db_blkid > end)) - continue; - /* Skip all direct blocks, only free spill blocks. */ - if (freespill && (db->db_blkid != DMU_SPILL_BLKID)) - continue; + if (db->db_level != 0 || db->db_blkid > end_blkid) { + break; + } + ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); @@ -968,6 +980,9 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) mutex_exit(&db->db_mtx); } + +out: + kmem_free(db_search, sizeof (dmu_buf_impl_t)); mutex_exit(&dn->dn_dbufs_mtx); } @@ -1657,7 +1672,7 @@ dbuf_clear(dmu_buf_impl_t *db) dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { - list_remove(&dn->dn_dbufs, db); + avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); @@ -1829,7 +1844,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, mutex_exit(&dn->dn_dbufs_mtx); return (odb); } - list_insert_head(&dn->dn_dbufs, db); + avl_add(&dn->dn_dbufs, db); if (db->db_level == 0 && db->db_blkid >= dn->dn_unlisted_l0_blkid) dn->dn_unlisted_l0_blkid = db->db_blkid + 1; @@ -1888,7 +1903,7 @@ dbuf_destroy(dmu_buf_impl_t *db) DB_DNODE_ENTER(db); dn = DB_DNODE(db); mutex_enter(&dn->dn_dbufs_mtx); - list_remove(&dn->dn_dbufs, db); + avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); mutex_exit(&dn->dn_dbufs_mtx); DB_DNODE_EXIT(db); @@ -1906,7 +1921,6 @@ dbuf_destroy(dmu_buf_impl_t *db) db->db_parent = NULL; db->db_buf = NULL; - ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 815696f70..41722f25c 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -62,6 +62,43 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT; static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); #endif /* _KERNEL */ +static int +dbuf_compare(const void *x1, const void *x2) +{ + const dmu_buf_impl_t *d1 = x1; + const dmu_buf_impl_t *d2 = x2; + + if (d1->db_level < d2->db_level) { + return (-1); + } else if (d1->db_level > d2->db_level) { + return (1); + } + + if (d1->db_blkid < d2->db_blkid) { + return (-1); + } else if (d1->db_blkid > d2->db_blkid) { + return (1); + } + + /* + * If a dbuf is being evicted while dn_dbufs_mutex is not held, we set + * the db_state to DB_EVICTING but do not remove it from dn_dbufs. If + * another thread creates a dbuf of the same blkid before the dbuf is + * removed from dn_dbufs, we can reach a state where there are two + * dbufs of the same blkid and level in db_dbufs. To maintain the avl + * invariant that there cannot be duplicate items, we distinguish + * between these two dbufs based on the time they were created. + */ + if (d1->db_creation < d2->db_creation) { + return (-1); + } else if (d1->db_creation > d2->db_creation) { + return (1); + } else { + ASSERT3P(d1, ==, d2); + return (0); + } +} + /* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) @@ -116,7 +153,7 @@ dnode_cons(void *arg, void *unused, int kmflag) dn->dn_dbufs_count = 0; dn->dn_unlisted_l0_blkid = 0; - list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), + avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); dn->dn_moved = 0; @@ -169,7 +206,7 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_dbufs_count); ASSERT0(dn->dn_unlisted_l0_blkid); - list_destroy(&dn->dn_dbufs); + avl_destroy(&dn->dn_dbufs); } void @@ -503,7 +540,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT0(dn->dn_assigned_txg); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + ASSERT(avl_is_empty(&dn->dn_dbufs)); for (i = 0; i < TXG_SIZE; i++) { ASSERT0(dn->dn_next_nblkptr[i]); @@ -689,8 +726,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; ASSERT(refcount_count(&odn->dn_tx_holds) == 0); refcount_transfer(&ndn->dn_holds, &odn->dn_holds); - ASSERT(list_is_empty(&ndn->dn_dbufs)); - list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs); + ASSERT(avl_is_empty(&ndn->dn_dbufs)); + avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); ndn->dn_dbufs_count = odn->dn_dbufs_count; ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid; ndn->dn_bonus = odn->dn_bonus; @@ -724,7 +761,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) */ odn->dn_dbuf = NULL; odn->dn_handle = NULL; - list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t), + avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); odn->dn_dbufs_count = 0; odn->dn_unlisted_l0_blkid = 0; @@ -1238,7 +1275,8 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) return; } - ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); + ASSERT(!refcount_is_zero(&dn->dn_holds) || + !avl_is_empty(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]); ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]); @@ -1311,7 +1349,7 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx) int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { - dmu_buf_impl_t *db, *db_next; + dmu_buf_impl_t *db; int err; if (size == 0) @@ -1334,9 +1372,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) goto fail; mutex_enter(&dn->dn_dbufs_mtx); - for (db = list_head(&dn->dn_dbufs); db; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); - + for (db = avl_first(&dn->dn_dbufs); db != NULL; + db = AVL_NEXT(&dn->dn_dbufs, db)) { if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 9779c51ef..4e613dd76 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -406,16 +406,13 @@ dnode_evict_dbufs(dnode_t *dn) int pass = 0; do { - dmu_buf_impl_t *db, marker; + dmu_buf_impl_t *db, *db_next; int evicting = FALSE; progress = FALSE; mutex_enter(&dn->dn_dbufs_mtx); - list_insert_tail(&dn->dn_dbufs, &marker); - db = list_head(&dn->dn_dbufs); - for (; db != ▮ db = list_head(&dn->dn_dbufs)) { - list_remove(&dn->dn_dbufs, db); - list_insert_tail(&dn->dn_dbufs, db); + for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { + db_next = AVL_NEXT(&dn->dn_dbufs, db); #ifdef DEBUG DB_DNODE_ENTER(db); ASSERT3P(DB_DNODE(db), ==, dn); @@ -435,7 +432,6 @@ dnode_evict_dbufs(dnode_t *dn) } } - list_remove(&dn->dn_dbufs, &marker); /* * NB: we need to drop dn_dbufs_mtx between passes so * that any DB_EVICTING dbufs can make progress. @@ -516,7 +512,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + ASSERT(avl_is_empty(&dn->dn_dbufs)); ASSERT3P(dn->dn_bonus, ==, NULL); /*