From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Tom Caputi Date: Tue, 10 Apr 2018 14:15:05 -0400 Subject: [PATCH] Fix race in dnode_check_slots_free() Currently, dnode_check_slots_free() works by checking dn->dn_type in the dnode to determine if the dnode is reclaimable. However, there is a small window of time between dnode_free_sync() in the first call to dsl_dataset_sync() and when the useraccounting code is run when the type is set DMU_OT_NONE, but the dnode is not yet evictable, leading to crashes. This patch adds the ability for dnodes to track which txg they were last dirtied in and adds a check for this before performing the reclaim. This patch also corrects several instances when dn_dirty_link was treated as a list_node_t when it is technically a multilist_node_t. Reviewed-by: Brian Behlendorf Signed-off-by: Tom Caputi Requires-spl: spl-0.7-release Issue #7147 Issue #7388 Issue #7997 (cherry-picked from behlendorf/issue-7997 4764f6f3be90be073d2700653dff286371e52583) Signed-off-by: Stoiko Ivanov --- include/sys/dmu_impl.h | 1 + include/sys/dnode.h | 4 ++++ module/zfs/dbuf.c | 3 +++ module/zfs/dmu.c | 2 +- module/zfs/dmu_objset.c | 15 +++++++++++++++ module/zfs/dnode.c | 29 +++++++++++++++++++---------- 6 files changed, 43 insertions(+), 11 deletions(-) diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index 65e417e3..03a63077 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -161,6 +161,7 @@ extern "C" { * dn_allocated_txg * dn_free_txg * dn_assigned_txg + * dn_dirty_txg * dd_assigned_tx * dn_notxholds * dn_dirtyctx diff --git a/include/sys/dnode.h b/include/sys/dnode.h index ea7defe1..2dd087b3 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -260,6 +260,7 @@ struct dnode { uint64_t dn_allocated_txg; uint64_t dn_free_txg; uint64_t dn_assigned_txg; + uint64_t dn_dirty_txg; /* txg dnode was last dirtied */ kcondvar_t dn_notxholds; enum dnode_dirtycontext dn_dirtyctx; uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ @@ -362,6 +363,9 @@ void dnode_evict_dbufs(dnode_t *dn); void dnode_evict_bonus(dnode_t *dn); void dnode_free_interior_slots(dnode_t *dn); +#define DNODE_IS_DIRTY(_dn) \ + ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa)) + #define DNODE_IS_CACHEABLE(_dn) \ ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ (DMU_OT_IS_METADATA((_dn)->dn_type) && \ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 4ee121f5..6edb39d6 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1606,6 +1606,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) FTAG); } } + + if (tx->tx_txg > dn->dn_dirty_txg) + dn->dn_dirty_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); if (db->db_blkid == DMU_SPILL_BLKID) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 6f09aa2f..a09ac4f9 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2044,7 +2044,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) * Check if dnode is dirty */ for (i = 0; i < TXG_SIZE; i++) { - if (list_link_active(&dn->dn_dirty_link[i])) { + if (multilist_link_active(&dn->dn_dirty_link[i])) { clean = B_FALSE; break; } diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 449ebedf..0bed2d3e 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1213,10 +1213,23 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); multilist_sublist_remove(list, dn); + /* + * If we are not doing useraccounting (os_synced_dnodes == NULL) + * we are done with this dnode for this txg. Unset dn_dirty_txg + * if later txgs aren't dirtying it so that future holders do + * not get a stale value. Otherwise, we will do this in + * userquota_updates_task() when processing has completely + * finished for this txg. + */ multilist_t *newlist = dn->dn_objset->os_synced_dnodes; if (newlist != NULL) { (void) dnode_add_ref(dn, newlist); multilist_insert(newlist, dn); + } else { + mutex_enter(&dn->dn_mtx); + if (dn->dn_dirty_txg == tx->tx_txg) + dn->dn_dirty_txg = 0; + mutex_exit(&dn->dn_mtx); } dnode_sync(dn, tx); @@ -1621,6 +1634,8 @@ userquota_updates_task(void *arg) dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); + if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa)) + dn->dn_dirty_txg = 0; mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index d465b545..4a169c49 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -137,7 +137,7 @@ dnode_cons(void *arg, void *unused, int kmflag) bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { - list_link_init(&dn->dn_dirty_link[i]); + multilist_link_init(&dn->dn_dirty_link[i]); dn->dn_free_ranges[i] = NULL; list_create(&dn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), @@ -147,6 +147,7 @@ dnode_cons(void *arg, void *unused, int kmflag) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; + dn->dn_dirty_txg = 0; dn->dn_dirtyctx = 0; dn->dn_dirtyctx_firstset = NULL; dn->dn_bonus = NULL; @@ -184,7 +185,7 @@ dnode_dest(void *arg, void *unused) ASSERT(!list_link_active(&dn->dn_link)); for (i = 0; i < TXG_SIZE; i++) { - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); list_destroy(&dn->dn_dirty_records[i]); ASSERT0(dn->dn_next_nblkptr[i]); @@ -199,6 +200,7 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_free_txg); ASSERT0(dn->dn_assigned_txg); + ASSERT0(dn->dn_dirty_txg); ASSERT0(dn->dn_dirtyctx); ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); ASSERT3P(dn->dn_bonus, ==, NULL); @@ -523,6 +525,7 @@ dnode_destroy(dnode_t *dn) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; + dn->dn_dirty_txg = 0; dn->dn_dirtyctx = 0; if (dn->dn_dirtyctx_firstset != NULL) { @@ -592,6 +595,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_assigned_txg); + ASSERT0(dn->dn_dirty_txg); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); ASSERT(avl_is_empty(&dn->dn_dbufs)); @@ -604,7 +608,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_blksz[i]); - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); } @@ -779,6 +783,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; ndn->dn_assigned_txg = odn->dn_assigned_txg; + ndn->dn_dirty_txg = odn->dn_dirty_txg; ndn->dn_dirtyctx = odn->dn_dirtyctx; ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; ASSERT(refcount_count(&odn->dn_tx_holds) == 0); @@ -845,6 +850,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) odn->dn_allocated_txg = 0; odn->dn_free_txg = 0; odn->dn_assigned_txg = 0; + odn->dn_dirty_txg = 0; odn->dn_dirtyctx = 0; odn->dn_dirtyctx_firstset = NULL; odn->dn_have_spill = B_FALSE; @@ -1069,6 +1075,10 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) { ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + /* + * If all dnode slots are either already free or + * evictable return B_TRUE. + */ for (int i = idx; i < idx + slots; i++) { dnode_handle_t *dnh = &children->dnc_children[i]; dnode_t *dn = dnh->dnh_dnode; @@ -1077,18 +1087,17 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) continue; } else if (DN_SLOT_IS_PTR(dn)) { mutex_enter(&dn->dn_mtx); - dmu_object_type_t type = dn->dn_type; + boolean_t can_free = (dn->dn_type == DMU_OT_NONE && + !DNODE_IS_DIRTY(dn)); mutex_exit(&dn->dn_mtx); - if (type != DMU_OT_NONE) + if (!can_free) return (B_FALSE); - - continue; + else + continue; } else { return (B_FALSE); } - - return (B_FALSE); } return (B_TRUE); @@ -1594,7 +1603,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) /* * If we are already marked dirty, we're done. */ - if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { + if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { multilist_sublist_unlock(mls); return; }