diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index a8b5418b6..8a21bf378 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -200,7 +200,8 @@ extern uint64_t metaslab_df_alloc_threshold;
 extern unsigned long zfs_deadman_synctime_ms;
 extern int metaslab_preload_limit;
 extern boolean_t zfs_compressed_arc_enabled;
-extern int zfs_abd_scatter_enabled;
+extern int zfs_abd_scatter_enabled;
+extern int dmu_object_alloc_chunk_shift;
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -314,6 +315,7 @@ static ztest_shared_callstate_t *ztest_shared_callstate;
 ztest_func_t ztest_dmu_read_write;
 ztest_func_t ztest_dmu_write_parallel;
 ztest_func_t ztest_dmu_object_alloc_free;
+ztest_func_t ztest_dmu_object_next_chunk;
 ztest_func_t ztest_dmu_commit_callbacks;
 ztest_func_t ztest_zap;
 ztest_func_t ztest_zap_parallel;
@@ -361,6 +363,7 @@ ztest_info_t ztest_info[] = {
         ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
         ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
         ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
+        ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
         ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
         ZTI_INIT(ztest_zap, 30, &zopt_always),
         ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
@@ -4055,6 +4058,26 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
         umem_free(od, size);
 }
 
+/*
+ * Rewind the global allocator to verify object allocation backfilling.
+ */
+void
+ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
+{
+        objset_t *os = zd->zd_os;
+        int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+        uint64_t object;
+
+        /*
+         * Rewind the global allocator randomly back to a lower object number
+         * to force backfilling and reclamation of recently freed dnodes.
+         */
+        mutex_enter(&os->os_obj_lock);
+        object = ztest_random(os->os_obj_next_chunk);
+        os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk);
+        mutex_exit(&os->os_obj_lock);
+}
+
 #undef OD_ARRAY_SIZE
 #define OD_ARRAY_SIZE 2
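For context on the new ztest callback: ztest_dmu_object_next_chunk() rewinds os_obj_next_chunk by rounding a random object number down to a chunk boundary with P2ALIGN(), so later allocations must backfill the rewound range. Below is a minimal standalone sketch of that rounding only, not code from the patch; the chunk shift of 7 (128 dnodes per chunk) and the object number 1000 are assumed example values.

/*
 * Illustrative sketch (not part of the patch): P2ALIGN() rounds the
 * randomly chosen object number down to the start of its dnode chunk,
 * mirroring what ztest_dmu_object_next_chunk() does under os_obj_lock.
 */
#include <stdio.h>
#include <stdint.h>

#define P2ALIGN(x, align)       ((x) & -(align))

int
main(void)
{
        int chunk_shift = 7;                    /* assumed example value */
        uint64_t dnodes_per_chunk = 1ULL << chunk_shift;
        uint64_t object = 1000;                 /* stand-in for ztest_random() */
        uint64_t rewound = P2ALIGN(object, dnodes_per_chunk);

        /* 1000 rounded down to a multiple of 128 is 896. */
        printf("os_obj_next_chunk: %llu -> %llu\n",
            (unsigned long long)object, (unsigned long long)rewound);
        return (0);
}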
diff --git a/include/sys/dnode.h b/include/sys/dnode.h
index a2bef9d2c..691fd443a 100644
--- a/include/sys/dnode.h
+++ b/include/sys/dnode.h
@@ -424,6 +424,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
     int minlvl, uint64_t blkfill, uint64_t txg);
 void dnode_evict_dbufs(dnode_t *dn);
 void dnode_evict_bonus(dnode_t *dn);
+void dnode_free_interior_slots(dnode_t *dn);
 
 #define DNODE_IS_CACHEABLE(_dn) \
         ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
@@ -517,6 +518,11 @@ typedef struct dnode_stats {
          * which had already been unlinked in an earlier txg.
          */
         kstat_named_t dnode_hold_free_txg;
+        /*
+         * Number of times dnode_free_interior_slots() needed to retry
+         * acquiring a slot zrl lock due to contention.
+         */
+        kstat_named_t dnode_free_interior_lock_retry;
         /*
          * Number of new dnodes allocated by dnode_allocate().
          */
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index e7412b750..f53da407f 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -275,7 +275,6 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
         return (err);
 }
 
-
 int
 dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 63a4f98bf..2c2ed8fb3 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -2455,10 +2455,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
         }
 
         err = dmu_object_info(rwa->os, drro->drr_object, &doi);
-
-        if (err != 0 && err != ENOENT)
+        if (err != 0 && err != ENOENT && err != EEXIST)
                 return (SET_ERROR(EINVAL));
-        object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
 
         if (drro->drr_object > rwa->max_object)
                 rwa->max_object = drro->drr_object;
@@ -2476,6 +2474,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
                 int nblkptr = deduce_nblkptr(drro->drr_bonustype,
                     drro->drr_bonuslen);
 
+                object = drro->drr_object;
+
                 /* nblkptr will be bounded by the bonus size and type */
                 if (rwa->raw && nblkptr != drro->drr_nblkptr)
                         return (SET_ERROR(EINVAL));
@@ -2484,18 +2484,89 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
                     (drro->drr_blksz != doi.doi_data_block_size ||
                     nblkptr < doi.doi_nblkptr ||
                     indblksz != doi.doi_metadata_block_size ||
-                    drro->drr_nlevels < doi.doi_indirection)) {
+                    drro->drr_nlevels < doi.doi_indirection ||
+                    drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT)) {
                         err = dmu_free_long_range_raw(rwa->os,
                             drro->drr_object, 0, DMU_OBJECT_END);
                         if (err != 0)
                                 return (SET_ERROR(EINVAL));
                 } else if (drro->drr_blksz != doi.doi_data_block_size ||
-                    nblkptr < doi.doi_nblkptr) {
+                    nblkptr < doi.doi_nblkptr ||
+                    drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
                         err = dmu_free_long_range(rwa->os, drro->drr_object,
                             0, DMU_OBJECT_END);
                         if (err != 0)
                                 return (SET_ERROR(EINVAL));
                 }
+
+                /*
+                 * The dmu does not currently support decreasing nlevels
+                 * on an object. For non-raw sends, this does not matter
+                 * and the new object can just use the previous one's nlevels.
+                 * For raw sends, however, the structure of the received dnode
+                 * (including nlevels) must match that of the send side.
+                 * Therefore, instead of using dmu_object_reclaim(), we must
+                 * free the object completely and call dmu_object_claim_dnsize()
+                 * instead.
+                 */
+                if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) ||
+                    drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
+                        if (rwa->raw) {
+                                err = dmu_free_long_object_raw(rwa->os,
+                                    drro->drr_object);
+                        } else {
+                                err = dmu_free_long_object(rwa->os,
+                                    drro->drr_object);
+                        }
+                        if (err != 0)
+                                return (SET_ERROR(EINVAL));
+
+                        txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+                        object = DMU_NEW_OBJECT;
+                }
+        } else if (err == EEXIST) {
+                /*
+                 * The object requested is currently an interior slot of a
+                 * multi-slot dnode. This will be resolved when the next txg
+                 * is synced out, since the send stream will have told us
+                 * to free this slot when we freed the associated dnode
+                 * earlier in the stream.
+                 */
+                txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+                object = drro->drr_object;
+        } else {
+                /* object is free and we are about to allocate a new one */
+                object = DMU_NEW_OBJECT;
+        }
+
+        /*
+         * If this is a multi-slot dnode there is a chance that this
+         * object will expand into a slot that is already used by
+         * another object from the previous snapshot. We must free
+         * these objects before we attempt to allocate the new dnode.
+         */
+        if (drro->drr_dn_slots > 1) {
+                for (uint64_t slot = drro->drr_object + 1;
+                    slot < drro->drr_object + drro->drr_dn_slots;
+                    slot++) {
+                        dmu_object_info_t slot_doi;
+
+                        err = dmu_object_info(rwa->os, slot, &slot_doi);
+                        if (err == ENOENT || err == EEXIST)
+                                continue;
+                        else if (err != 0)
+                                return (err);
+
+                        if (rwa->raw)
+                                err = dmu_free_long_object_raw(rwa->os, slot);
+                        else
+                                err = dmu_free_long_object(rwa->os, slot);
+
+                        if (err != 0)
+                                return (err);
+                }
+
+                txg_wait_synced(dmu_objset_pool(rwa->os), 0);
         }
 
         tx = dmu_tx_create(rwa->os);
@@ -2849,6 +2920,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
                 dmu_tx_abort(tx);
                 return (err);
         }
+
         if (rwa->raw) {
                 VERIFY0(dmu_object_dirty_raw(rwa->os, drrs->drr_object, tx));
                 dmu_buf_will_change_crypt_params(db_spill, tx);
@@ -3199,7 +3271,7 @@ receive_read_record(struct receive_arg *ra)
                 * See receive_read_prefetch for an explanation why we're
                 * storing this object in the ignore_obj_list.
                 */
-                if (err == ENOENT ||
+                if (err == ENOENT || err == EEXIST ||
                     (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
                         objlist_insert(&ra->ignore_objlist, drro->drr_object);
                         err = 0;
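The receive_object() changes above hinge on the mapping between dnode size and slot count: drr_dn_slots from the stream is compared against doi_dnodesize >> DNODE_SHIFT on the receiving pool, and any mismatch forces the object to be freed and reallocated rather than reclaimed in place. A minimal sketch of that mapping follows; it is illustrative only and not code from the patch (DNODE_SHIFT is the 512-byte dnode quantum).

/*
 * Illustrative sketch (not part of the patch): dnodesize-to-slot-count
 * mapping used by the checks above.  A 512-byte (legacy) dnode is one
 * slot, a 1k dnode is two, and a 2k dnode is four.  The extra slots are
 * the "interior" slots that may still be occupied by objects from an
 * older snapshot, which is why receive_object() frees them first.
 */
#include <stdio.h>
#include <stdint.h>

#define DNODE_SHIFT     9       /* 512-byte dnode quantum */

int
main(void)
{
        uint64_t sizes[] = { 512, 1024, 2048 };

        for (int i = 0; i < 3; i++) {
                uint64_t slots = sizes[i] >> DNODE_SHIFT;

                printf("dnodesize %4llu -> %llu slot(s)\n",
                    (unsigned long long)sizes[i],
                    (unsigned long long)slots);
        }
        return (0);
}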
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 544e736d8..b4c131e98 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -55,6 +55,7 @@ dnode_stats_t dnode_stats = {
         { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
         { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
         { "dnode_hold_free_txg", KSTAT_DATA_UINT64 },
+        { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
         { "dnode_allocate", KSTAT_DATA_UINT64 },
         { "dnode_reallocate", KSTAT_DATA_UINT64 },
         { "dnode_buf_evict", KSTAT_DATA_UINT64 },
@@ -518,7 +519,8 @@ dnode_destroy(dnode_t *dn)
         mutex_exit(&os->os_lock);
 
         /* the dnode can no longer move, so we can release the handle */
-        zrl_remove(&dn->dn_handle->dnh_zrlock);
+        if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
+                zrl_remove(&dn->dn_handle->dnh_zrlock);
 
         dn->dn_allocated_txg = 0;
         dn->dn_free_txg = 0;
@@ -665,6 +667,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
             DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
         dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
+
+        dnode_free_interior_slots(dn);
         DNODE_STAT_BUMP(dnode_reallocate);
 
         /* clean up any unreferenced dbufs */
@@ -1067,19 +1071,73 @@ dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
 }
 
 static boolean_t
-dnode_check_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
 {
         ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
 
         for (int i = idx; i < idx + slots; i++) {
                 dnode_handle_t *dnh = &children->dnc_children[i];
-                if (dnh->dnh_dnode != ptr)
+                dnode_t *dn = dnh->dnh_dnode;
+
+                if (dn == DN_SLOT_FREE) {
+                        continue;
+                } else if (DN_SLOT_IS_PTR(dn)) {
+                        mutex_enter(&dn->dn_mtx);
+                        dmu_object_type_t type = dn->dn_type;
+                        mutex_exit(&dn->dn_mtx);
+
+                        if (type != DMU_OT_NONE)
+                                return (B_FALSE);
+
+                        continue;
+                } else {
                         return (B_FALSE);
+                }
+
+                return (B_FALSE);
         }
 
         return (B_TRUE);
 }
 
+static void
+dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
+{
+        ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+        for (int i = idx; i < idx + slots; i++) {
+                dnode_handle_t *dnh = &children->dnc_children[i];
+
+                ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+
+                if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                        ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
+                        dnode_destroy(dnh->dnh_dnode);
+                        dnh->dnh_dnode = DN_SLOT_FREE;
+                }
+        }
+}
+
+void
+dnode_free_interior_slots(dnode_t *dn)
+{
+        dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
+        int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
+        int idx = (dn->dn_object & (epb - 1)) + 1;
+        int slots = dn->dn_num_slots - 1;
+
+        if (slots == 0)
+                return;
+
+        ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+        while (!dnode_slots_tryenter(children, idx, slots))
+                DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
+
+        dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
+        dnode_slots_rele(children, idx, slots);
+}
+
 void
 dnode_special_close(dnode_handle_t *dnh)
 {
@@ -1377,7 +1435,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
                 while (dn == DN_SLOT_UNINIT) {
                         dnode_slots_hold(dnc, idx, slots);
 
-                        if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+                        if (!dnode_check_slots_free(dnc, idx, slots)) {
                                 DNODE_STAT_BUMP(dnode_hold_free_misses);
                                 dnode_slots_rele(dnc, idx, slots);
                                 dbuf_rele(db, FTAG);
@@ -1390,15 +1448,29 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
                                 continue;
                         }
 
-                        if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+                        if (!dnode_check_slots_free(dnc, idx, slots)) {
                                 DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
                                 dnode_slots_rele(dnc, idx, slots);
                                 dbuf_rele(db, FTAG);
                                 return (SET_ERROR(ENOSPC));
                         }
 
+                        /*
+                         * Allocated but otherwise free dnodes which would
+                         * be in the interior of a multi-slot dnode need
+                         * to be freed. Single slot dnodes can be safely
+                         * re-purposed as a performance optimization.
+                         */
+                        if (slots > 1)
+                                dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+
                         dnh = &dnc->dnc_children[idx];
-                        dn = dnode_create(os, dn_block + idx, db, object, dnh);
+                        if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                                dn = dnh->dnh_dnode;
+                        } else {
+                                dn = dnode_create(os, dn_block + idx, db,
+                                    object, dnh);
+                        }
                 }
 
                 mutex_enter(&dn->dn_mtx);
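dnode_free_interior_slots() above derives the range of handles to reset from the object number and the metadnode block size. A minimal sketch of that index arithmetic, using assumed example values (a 16k metadnode block, i.e. 32 slots, and a 2k dnode at object 100) rather than anything taken from a real pool:

/*
 * Illustrative sketch (not part of the patch): the index arithmetic used
 * by dnode_free_interior_slots().  With a 16k metadnode block and
 * 512-byte slots there are epb = 32 slots per block; an object's slot
 * index within its block is (object & (epb - 1)), and the interior slots
 * that get reset to DN_SLOT_FREE start one past it.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint64_t db_size = 16384;               /* metadnode block size */
        int epb = db_size >> 9;                 /* DNODE_SHIFT == 9 -> 32 slots */
        uint64_t dn_object = 100;               /* example 2k dnode, 4 slots */
        int dn_num_slots = 4;

        int idx = (dn_object & (epb - 1)) + 1;  /* first interior slot */
        int slots = dn_num_slots - 1;           /* number of interior slots */

        /* Object 100 sits at slot 4 of its block; slots 5-7 are interior. */
        printf("object %llu: free interior slots [%d, %d) in its block\n",
            (unsigned long long)dn_object, idx, idx + slots);
        return (0);
}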
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 09437993a..7d3850a5f 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -529,6 +529,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
         if (dn->dn_allocated_txg != dn->dn_free_txg)
                 dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
         bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
+        dnode_free_interior_slots(dn);
 
         mutex_enter(&dn->dn_mtx);
         dn->dn_type = DMU_OT_NONE;
@@ -536,6 +537,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
         dn->dn_allocated_txg = 0;
         dn->dn_free_txg = 0;
         dn->dn_have_spill = B_FALSE;
+        dn->dn_num_slots = 1;
         mutex_exit(&dn->dn_mtx);
 
         ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index a62dd352c..16c10d41a 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -657,7 +657,7 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
     'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
     'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
     'send-c_recv_dedup', 'send_encrypted_files', 'send_encrypted_heirarchy',
-    'send_freeobjects']
+    'send_freeobjects', 'send_realloc_dnode_size']
 tags = ['functional', 'rsend']
 
 [tests/functional/scrub_mirror]
diff --git a/tests/zfs-tests/tests/functional/rsend/Makefile.am b/tests/zfs-tests/tests/functional/rsend/Makefile.am
index 6598b1c3d..7a8b8a33a 100644
--- a/tests/zfs-tests/tests/functional/rsend/Makefile.am
+++ b/tests/zfs-tests/tests/functional/rsend/Makefile.am
@@ -40,4 +40,5 @@ dist_pkgdata_SCRIPTS = \
         send-c_volume.ksh \
         send-c_zstreamdump.ksh \
         send-cpL_varied_recsize.ksh \
-        send_freeobjects.ksh
+        send_freeobjects.ksh \
+        send_realloc_dnode_size.ksh
diff --git a/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
new file mode 100755
index 000000000..206763949
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
@@ -0,0 +1,98 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify incremental receive properly handles objects with changed
+# dnode slot count.
+#
+# Strategy:
+# 1. Populate a dataset with 1k byte dnodes and snapshot
+# 2. Remove objects, set dnodesize=legacy, and remount dataset so new objects
+#    get recycled numbers and formerly "interior" dnode slots get assigned
+#    to new objects
+# 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
+#    overlap with recently recycled slots and formerly "normal" dnode slots
+#    get assigned to new objects
+# 4. Generate initial and incremental streams
+# 5. Verify initial and incremental streams can be received
+#

+verify_runnable "both"
+
+log_assert "Verify incremental receive handles objects with changed dnode size"
+
+function cleanup
+{
+        rm -f $BACKDIR/fs-dn-legacy
+        rm -f $BACKDIR/fs-dn-1k
+        rm -f $BACKDIR/fs-dn-2k
+
+        if datasetexists $POOL/fs ; then
+                log_must zfs destroy -rR $POOL/fs
+        fi
+
+        if datasetexists $POOL/newfs ; then
+                log_must zfs destroy -rR $POOL/newfs
+        fi
+}
+
+log_onexit cleanup
+
+# 1. Populate a dataset with 1k byte dnodes and snapshot
+log_must zfs create -o dnodesize=1k $POOL/fs
+log_must mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@a
+
+# 2. Remove objects, set dnodesize=legacy, and remount dataset so new objects
+#    get recycled numbers and formerly "interior" dnode slots get assigned
+#    to new objects
+rm /$POOL/fs/*
+
+log_must zfs unmount $POOL/fs
+log_must zfs set dnodesize=legacy $POOL/fs
+log_must zfs mount $POOL/fs
+
+log_must mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@b
+
+# 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
+#    overlap with recently recycled slots and formerly "normal" dnode slots
+#    get assigned to new objects
+rm /$POOL/fs/*
+
+log_must zfs unmount $POOL/fs
+log_must zfs set dnodesize=2k $POOL/fs
+log_must zfs mount $POOL/fs
+
+log_must mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@c
+
+# 4. Generate initial and incremental streams
+log_must eval "zfs send $POOL/fs@a > $BACKDIR/fs-dn-1k"
+log_must eval "zfs send -i $POOL/fs@a $POOL/fs@b > $BACKDIR/fs-dn-legacy"
+log_must eval "zfs send -i $POOL/fs@b $POOL/fs@c > $BACKDIR/fs-dn-2k"
+
+# 5. Verify initial and incremental streams can be received
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-1k"
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-legacy"
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-2k"
+
+log_pass "Verify incremental receive handles objects with changed dnode size"