From 39efbde7c551ae0edcd57db3aab28fd7f2d29d18 Mon Sep 17 00:00:00 2001 From: George Melikov Date: Fri, 27 Jan 2017 01:43:28 +0300 Subject: [PATCH] OpenZFS 6676 - Race between unique_insert() and unique_remove() causes ZFS fsid change Authored by: Josef 'Jeff' Sipek Reviewed by: Saso Kiselkov Reviewed by: Sanjay Nadkarni Reviewed by: Dan Vatca Reviewed by: Matthew Ahrens Reviewed by: George Wilson Reviewed by: Sebastien Roy Approved by: Robert Mustacchi Reviewed-by: Brian Behlendorf Ported-by: George Melikov OpenZFS-issue: https://www.illumos.org/issues/6676 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/40510e8 Closes #5667 --- include/sys/dmu.h | 24 +++++++++++++++++------- include/sys/zap_impl.h | 2 +- module/zfs/dbuf.c | 26 +++++++++++++++++++++----- module/zfs/dnode.c | 6 +++--- module/zfs/dsl_dataset.c | 35 ++++++++++++++++++++++++++++++----- module/zfs/dsl_dir.c | 7 ++++--- module/zfs/sa.c | 10 ++++++---- module/zfs/zap.c | 11 ++++++----- module/zfs/zap_micro.c | 4 ++-- 9 files changed, 90 insertions(+), 35 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 3c57de32e..9c8ca7c36 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -549,8 +549,14 @@ typedef struct dmu_buf_user { */ taskq_ent_t dbu_tqent; - /* This instance's eviction function pointer. */ - dmu_buf_evict_func_t *dbu_evict_func; + /* + * This instance's eviction function pointers. + * + * dbu_evict_func_sync is called synchronously and then + * dbu_evict_func_async is executed asynchronously on a taskq. + */ + dmu_buf_evict_func_t *dbu_evict_func_sync; + dmu_buf_evict_func_t *dbu_evict_func_async; #ifdef ZFS_DEBUG /* * Pointer to user's dbuf pointer. NULL for clients that do @@ -572,12 +578,16 @@ typedef struct dmu_buf_user { */ /*ARGSUSED*/ static inline void -dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, - dmu_buf_t **clear_on_evict_dbufp) +dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, + dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp) { - ASSERT(dbu->dbu_evict_func == NULL); - ASSERT(evict_func != NULL); - dbu->dbu_evict_func = evict_func; + ASSERT(dbu->dbu_evict_func_sync == NULL); + ASSERT(dbu->dbu_evict_func_async == NULL); + + /* must have at least one evict func */ + IMPLY(evict_func_sync == NULL, evict_func_async != NULL); + dbu->dbu_evict_func_sync = evict_func_sync; + dbu->dbu_evict_func_async = evict_func_async; taskq_init_ent(&dbu->dbu_tqent); #ifdef ZFS_DEBUG dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 13a208faf..cbe7f3c5b 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -198,7 +198,7 @@ boolean_t zap_match(zap_name_t *zn, const char *matchname); int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp); void zap_unlockdir(zap_t *zap, void *tag); -void zap_evict(void *dbu); +void zap_evict_sync(void *dbu); zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); int zap_hashbits(zap_t *zap); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index ca1a44303..907a84941 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -85,7 +85,9 @@ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); #ifndef __lint extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, - dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp); + dmu_buf_evict_func_t *evict_func_sync, + dmu_buf_evict_func_t *evict_func_async, + dmu_buf_t **clear_on_evict_dbufp); #endif /* ! __lint */ /* @@ -385,6 +387,7 @@ static void dbuf_evict_user(dmu_buf_impl_t *db) { dmu_buf_user_t *dbu = db->db_user; + boolean_t has_async; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -400,11 +403,24 @@ dbuf_evict_user(dmu_buf_impl_t *db) #endif /* - * Invoke the callback from a taskq to avoid lock order reversals - * and limit stack depth. + * There are two eviction callbacks - one that we call synchronously + * and one that we invoke via a taskq. The async one is useful for + * avoiding lock order reversals and limiting stack depth. + * + * Note that if we have a sync callback but no async callback, + * it's likely that the sync callback will free the structure + * containing the dbu. In that case we need to take care to not + * dereference dbu after calling the sync evict func. */ - taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0, - &dbu->dbu_tqent); + has_async = (dbu->dbu_evict_func_async != NULL); + + if (dbu->dbu_evict_func_sync != NULL) + dbu->dbu_evict_func_sync(dbu); + + if (has_async) { + taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, + dbu, 0, &dbu->dbu_tqent); + } } boolean_t diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index cbd54be04..4c37ce383 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1021,7 +1021,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, } static void -dnode_buf_pageout(void *dbu) +dnode_buf_evict_async(void *dbu) { dnode_children_t *children_dnodes = dbu; int i; @@ -1271,8 +1271,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, for (i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); } - dmu_buf_init_user(&children_dnodes->dnc_dbu, - dnode_buf_pageout, NULL); + dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL, + dnode_buf_evict_async, NULL); winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); if (winner != NULL) { diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index ddab3eddb..fb6feeec9 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 RackTop Systems. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -273,8 +273,24 @@ dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, return (B_TRUE); } +/* + * We have to release the fsid syncronously or we risk that a subsequent + * mount of the same dataset will fail to unique_insert the fsid. This + * failure would manifest itself as the fsid of this dataset changing + * between mounts which makes NFS clients quite unhappy. + */ static void -dsl_dataset_evict(void *dbu) +dsl_dataset_evict_sync(void *dbu) +{ + dsl_dataset_t *ds = dbu; + + ASSERT(ds->ds_owner == NULL); + + unique_remove(ds->ds_fsid_guid); +} + +static void +dsl_dataset_evict_async(void *dbu) { dsl_dataset_t *ds = dbu; @@ -282,8 +298,6 @@ dsl_dataset_evict(void *dbu) ds->ds_dbuf = NULL; - unique_remove(ds->ds_fsid_guid); - if (ds->ds_objset != NULL) dmu_objset_evict(ds->ds_objset); @@ -525,7 +539,8 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_reserved = ds->ds_quota = 0; } - dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf); + dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync, + dsl_dataset_evict_async, &ds->ds_dbuf); if (err == 0) winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); @@ -548,6 +563,16 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, } else { ds->ds_fsid_guid = unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid); + if (ds->ds_fsid_guid != + dsl_dataset_phys(ds)->ds_fsid_guid) { + zfs_dbgmsg("ds_fsid_guid changed from " + "%llx to %llx for pool %s dataset id %llu", + (long long) + dsl_dataset_phys(ds)->ds_fsid_guid, + (long long)ds->ds_fsid_guid, + spa_name(dp->dp_spa), + dsobj); + } } } ASSERT3P(ds->ds_dbuf, ==, dbuf); diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index b21735235..305a87ed9 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -129,7 +129,7 @@ extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); static void -dsl_dir_evict(void *dbu) +dsl_dir_evict_async(void *dbu) { dsl_dir_t *dd = dbu; int t; @@ -237,7 +237,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dmu_buf_rele(origin_bonus, FTAG); } - dmu_buf_init_user(&dd->dd_dbu, dsl_dir_evict, &dd->dd_dbuf); + dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, + &dd->dd_dbuf); winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); if (winner != NULL) { if (dd->dd_parent) diff --git a/module/zfs/sa.c b/module/zfs/sa.c index aa8f3192d..dda51529c 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -1297,7 +1297,7 @@ sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) /*ARGSUSED*/ static void -sa_evict(void *dbu) +sa_evict_sync(void *dbu) { panic("evicting sa dbuf\n"); } @@ -1394,7 +1394,8 @@ sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, sa_handle_t *winner = NULL; handle = kmem_cache_alloc(sa_cache, KM_SLEEP); - handle->sa_dbu.dbu_evict_func = NULL; + handle->sa_dbu.dbu_evict_func_sync = NULL; + handle->sa_dbu.dbu_evict_func_async = NULL; handle->sa_userp = userp; handle->sa_bonus = db; handle->sa_os = os; @@ -1405,7 +1406,8 @@ sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, error = sa_build_index(handle, SA_BONUS); if (hdl_type == SA_HDL_SHARED) { - dmu_buf_init_user(&handle->sa_dbu, sa_evict, NULL); + dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL, + NULL); winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); } diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 17107eb28..907ab2aa5 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -81,7 +81,8 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); zap->zap_ismicro = FALSE; - zap->zap_dbu.dbu_evict_func = zap_evict; + zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync; + zap->zap_dbu.dbu_evict_func_async = NULL; mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; @@ -399,7 +400,7 @@ zap_allocate_blocks(zap_t *zap, int nblocks) } static void -zap_leaf_pageout(void *dbu) +zap_leaf_evict_sync(void *dbu) { zap_leaf_t *l = dbu; @@ -423,7 +424,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); - dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf); + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu); ASSERT(winner == NULL); dmu_buf_will_dirty(l->l_dbuf, tx); @@ -470,13 +471,13 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db) l->l_bs = highbit64(db->db_size) - 1; l->l_dbuf = db; - dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf); + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); winner = dmu_buf_set_user(db, &l->l_dbu); rw_exit(&l->l_rwlock); if (winner != NULL) { /* someone else set it first */ - zap_leaf_pageout(&l->l_dbu); + zap_leaf_evict_sync(&l->l_dbu); l = winner; } diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 97c0fff65..cf552ed10 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -392,7 +392,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) * it, because zap_lockdir() checks zap_ismicro without the lock * held. */ - dmu_buf_init_user(&zap->zap_dbu, zap_evict, &zap->zap_dbuf); + dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); winner = dmu_buf_set_user(db, &zap->zap_dbu); if (winner != NULL) @@ -774,7 +774,7 @@ zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) } void -zap_evict(void *dbu) +zap_evict_sync(void *dbu) { zap_t *zap = dbu;